Code Example #1
File: index.py, Project: chen33/nlp
def getData():
	train_data= load_files('training')    
	test_data=load_files("test")
	count_Vec=TfidfVectorizer(min_df=1,decode_error="replace")
	doc_train=count_Vec.fit_transform(train_data.data)
	doc_test=count_Vec.transform(test_data.data)  # note: transform here, not fit_transform, so the test data reuses the vocabulary learned from the training data
	return doc_train.toarray(),train_data.target,doc_test.toarray()
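Note: a minimal usage sketch (not from the chen33/nlp project; the choice of MultinomialNB is an assumption) showing how the arrays returned by getData() could feed a classifier:

from sklearn.naive_bayes import MultinomialNB  # hypothetical classifier choice

X_train, y_train, X_test = getData()
clf = MultinomialNB().fit(X_train, y_train)  # fit on the dense TF-IDF features
predicted = clf.predict(X_test)              # predicted labels for the test documents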
Code Example #2
File: LR.py, Project: sazlin/reTOracle
def export_classifier():
    #note that this data is not in the git repo
    train_small = load_files('./training_data/')
    test_small = load_files('./test_data/')

    # Turn the text documents into vectors of word frequencies
    vectorizer = CountVectorizer(min_df=5, ngram_range=(1, 2),
                                 stop_words='english',
                                 strip_accents='ascii')
    X_train = vectorizer.fit_transform(train_small.data)
    y_train = train_small.target

    # Fit a classifier on the training set
    classifier = LogisticRegression(penalty='l2', tol=0.0001, C=1.0,
                                    fit_intercept=True, intercept_scaling=1,
                                    ).fit(X_train, y_train)
    print("Training score: {0:.1f}%".format(
        classifier.score(X_train, y_train) * 100))

    # Evaluate the classifier on the testing set
    X_test = vectorizer.transform(test_small.data)
    y_test = test_small.target
    print("Testing score: {0:.1f}%".format(
        classifier.score(X_test, y_test) * 100))
    export_pickle('LRclassifier.txt', classifier)
    export_pickle('LRvectorizer.txt', vectorizer)
Code Example #3
File: lr.py, Project: chen33/nlp
def getData():
	train_data= load_files('dataset/train')    
	test_data=load_files("dataset/test")
	count_Vec=TfidfVectorizer(min_df=1,decode_error="replace")
	doc_train=count_Vec.fit_transform(train_data.data)
	doc_test=count_Vec.transform(test_data.data)
	return doc_train.toarray(),train_data.target,doc_test.toarray(),test_data.target
Code Example #4
def createDataSet(train_path,test_path,category,k):
	"""
	create vectorized text feature
    '0' refer to 'atheism'
    '1' refer to 'sports'

	"""
	train_set = datasets.load_files(train_path,categories=category, 
	load_content=True, shuffle=True, encoding='utf-8', decode_error='ignore', random_state=0)

	count_vect = CountVectorizer(encoding = 'utf-8',lowercase = True,
	 decode_error = 'ignore',  analyzer = 'word', ngram_range = (2,4),min_df = 1)
	
	tfidf_vecter = TfidfVectorizer( max_df = 0.8, stop_words = 'english')

	test_set = datasets.load_files(test_path,categories=category, 
	load_content=True, shuffle=True, encoding='utf-8', decode_error='ignore', random_state=0)

	X_train_tfidf = tfidf_vecter.fit_transform(train_set.data)
	X_train_counts = count_vect.fit_transform(train_set.data)

	X_test_tfidf = tfidf_vecter.transform(test_set.data)
	X_test_counts = count_vect.transform(test_set.data)
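
	# relabel the targets for one-vs-rest learning: class index k becomes +1, all other classes become -1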
	for i in range(X_train_counts.shape[0]):
		if train_set.target[i] == k:
			train_set.target[i] = 1
		else:
			train_set.target[i] = -1

	for i in range(X_test_counts.shape[0]):
		if test_set.target[i] == k:
			test_set.target[i] = 1
		else:
			test_set.target[i] = -1

	#X_train_normalize = preprocessing.normalize(X_train_counts, norm = 'l2')

	#print train_set.target_names
	#print train_set.target
	#print size
	#print len(train_set.target)

	#print X_train_tfidf.shape
	#print X_train_counts
	#print X_train_normalize

	return X_train_counts, train_set.target, X_train_counts.shape,X_test_counts, test_set.target, X_test_counts.shape
Code Example #5
def load(dataset, categories):
    if dataset == 'full':
        train = load_files('aclImdb/aggregate/', categories=categories)
        return train

    elif dataset == 'split':    
        train = load_files('aclImdb/train/', categories=categories)
        test = load_files('aclImdb/test/', categories=categories)
        return (train, test)
Code Example #6
File: Homework_1.py, Project: zoezou2015/ML_hm1
def vector_for_input_binary(train_file_path="/mnt/hgfs/temp/machine learning/train",
                            test_file_path="/mnt/hgfs/temp/machine learning/test", categories=None):
    train_data = load.load_files(train_file_path, categories=categories, encoding='utf-8', decode_error='ignore')
    test_data = load.load_files(test_file_path, categories=categories, encoding='utf-8', decode_error='ignore')

    vectorized = feature_extraction.CountVectorizer(min_df=1, binary=True)
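    # binary=True records term presence/absence instead of raw counts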
    train_input = vectorized.fit_transform(train_data['data'])
    test_input = vectorized.transform(test_data['data'])

    return train_input, train_data['target'], test_input, test_data['target']
Code Example #7
def test_grid_search_cv_on_newsgroup():
    ## load news group data
    categories = [
        'alt.atheism',
        'talk.religion.misc',
        'comp.graphics',
        'sci.space',
    ]
    twenty_train_small = load_files('./data/20news-bydate-train/',
        categories=categories, charset='latin-1')
    twenty_test_small = load_files('./data/20news-bydate-test/',
        categories=categories, charset='latin-1')
    ## model pipeline using tfidf and passive aggressive
    pipeline = Pipeline((
        ('vec', TfidfVectorizer(min_df=1, max_df=0.8, use_idf=True)),
        ('clf', PassiveAggressiveClassifier(C=1)),
    ))
    param_grid = {
        'vec__min_df': [1, 2],
        'vec__max_df': [0.8, 1.0],
        'vec__ngram_range': [(1, 1), (1, 2)],
        'vec__use_idf': [True, False]
    }
    X, y = twenty_train_small.data, twenty_train_small.target
    ## cross validation on n_iter = 5
    grid_searcher = meta_search.GridSearch()
    # persist only once
    grid_searcher.persist_cv_splits('text_classification', X, y, './tmp/')
    grid_searcher.search(pipeline, param_grid)
    import time
    while not grid_searcher.isready():
        print time.sleep(2)
        print 'progress:', grid_searcher.progress()
        print 'best result:', grid_searcher.best_params_so_far()
        if grid_searcher.best_params_so_far():
            pass#grid_searcher.abort()
    print len(grid_searcher.partial_result())
    ## run again with naive bayesian
    ## no need to persist_cv_splits
    pipeline = Pipeline((
        ('vec', TfidfVectorizer(min_df=1, max_df=0.8, use_idf=True)),
        ('clf', MultinomialNB()),
    ))
    grid_searcher10 = meta_search.GridSearch(datafiles = grid_searcher.datafiles)
    grid_searcher10.search(pipeline, param_grid)
    while not grid_searcher10.isready():
        print time.sleep(2)
        print 'progress:', grid_searcher10.progress()
        print 'best result:', grid_searcher10.best_params_so_far()
        if grid_searcher10.best_params_so_far():
            pass#grid_searcher10.abort()
    print len(grid_searcher10.partial_result())    
Code Example #8
File: test.py, Project: titopsur/python_test
def main():
    #buildTrainSet()
    #buildTestSet()
    train = load_files('model/train', encoding='utf-8')
    test = load_files('model/test', encoding='utf-8')
    print train.target_names
#    for l in train.target_names:
#        print l
#    for l in train.target:
#        print l
    vectorizer = TfidfVectorizer(sublinear_tf=True, stop_words='english')
    X_train = vectorizer.fit_transform(train.data)
    X_test = vectorizer.transform(test.data)
    print vectorizer.get_feature_names()
Code Example #9
File: Homework_1.py, Project: zoezou2015/ML_hm1
def vector_for_input(train_file_path=path1,
                     test_file_path=path2, categories=None):
    train_data = load.load_files(train_file_path, categories=categories, encoding='utf-8', decode_error='ignore')
    test_data = load.load_files(test_file_path, categories=categories, encoding='utf-8', decode_error='ignore')

    # vectorized_normalized = feature_extraction.TfidfVectorizer(min_df=1)
    # train_input_normalized = vectorized_normalized.fit_transform(train_data['data'])
    # test_input_normalized = vectorized_normalized.transform(test_data['data'])

    vectorized = feature_extraction.CountVectorizer(min_df=1)
    train_input = vectorized.fit_transform(train_data['data'])
    test_input = vectorized.transform(test_data['data'])

    return train_input, train_data['target'], test_input, test_data['target']
Code Example #10
def test_docs(dir):
	# Load documents
	docs = datasets.load_files(container_path="../../sklearn_data/"+dir)
	X, y = docs.data, docs.target

	baseline = 1/float(len(list(np.unique(y))))

	# Select Features via Bag of Words approach without stop words
	#X = CountVectorizer(charset_error='ignore', stop_words='english', strip_accents='unicode', ).fit_transform(X)
	X = TfidfVectorizer(charset_error='ignore', stop_words='english', analyzer='char', ngram_range=(2,4), strip_accents='unicode', sublinear_tf=True, max_df=0.5).fit_transform(X)
	n_samples, n_features = X.shape


	# sklearn's grid search
	parameters = { 'alpha': np.logspace(-100,0,10)}

	bv = Bootstrap(n_samples, n_iter=10, test_size=0.3, random_state=42)
	mnb_gv = GridSearchCV(MultinomialNB(), parameters, cv=bv,)
	#scores = cross_val_score(mnb_gv, X, y, cv=bv)
	mnb_gv.fit(X, y)
	mnb_gv_best_params = mnb_gv.best_params_.values()[0]
	print mnb_gv.best_score_
	print mnb_gv_best_params

	# CV with Bootstrap
	mnb = MultinomialNB(alpha=mnb_gv_best_params)
	boot_scores = cross_val_score(mnb, X, y, cv=bv)
	print mean_sem(boot_scores)

	improvement = (mnb_gv.best_score_ - baseline) / baseline

	rand_baseline.append(baseline)
	test_results.append([mnb_gv.best_score_])
	com_results.append(improvement)
	sem_results.append(sem(boot_scores))
Code Example #11
File: _polarity.py, Project: PaulHuygen/xtas
def train(param_search=False):
    data = load_files(download())
    y = [data.target_names[t] for t in data.target]

    # The random state on the LR estimator is fixed to the most arbitrary value
    # that I could come up with. It is biased toward the middle number keys on
    # my keyboard.
    clf = make_pipeline(TfidfVectorizer(min_df=2, dtype=float,
                                        sublinear_tf=True,
                                        ngram_range=(1, 2),
                                        strip_accents='unicode'),
                        LogisticRegression(random_state=623, C=5000))

    if param_search:
        params = {'tfidfvectorizer__ngram_range': [(1, 1), (1, 2)],
                  'logisticregression__C': [1000, 5000, 10000]}

        print("Starting parameter search for review sentiment classification")
        # We ignore the original folds in the data, preferring a simple 5-fold
        # CV instead; this is intended to get a working model, not results for
        # publication.
        gs = GridSearchCV(clf, params, cv=5, refit=True, n_jobs=-1, verbose=2)
        gs.fit(data.data, y)

        print("Parameters found:")
        pprint(gs.best_params_)
        print("Cross-validation accuracy: %.3f" % gs.best_score_)

        return gs.best_estimator_

    else:
        print("Training logistic regression for movie review polarity")
        return clf.fit(data.data, y)
Code Example #12
def load_data():
    # Download the data and unpack it into the ./data/txt_sentoken folder
    # "http://www.cs.cornell.edu/people/pabo/movie-review-data/review_polarity.tar.gz"
    dataset = load_files('./data/txt_sentoken', shuffle=False)
    print("n_samples: %d" % len(dataset.data))

    return dataset
Code Example #13
def test_load_files_w_categories_desc_and_encoding():
    category = os.path.abspath(TEST_CATEGORY_DIR1).split("/").pop()
    res = load_files(LOAD_FILES_ROOT, description="test", categories=category, encoding="utf-8")
    assert_equal(len(res.filenames), 1)
    assert_equal(len(res.target_names), 1)
    assert_equal(res.DESCR, "test")
    assert_equal(res.data, [u("Hello World!\n")])
Code Example #14
def load_docs(path):
    dataset = load_files(args.train_path)
    docs = []
    for raw_data in dataset.data:
        docs.append(json.loads(raw_data))
    dataset.data = docs
    return dataset
Code Example #15
File: model.py, Project: krsreenatha/IndyRef
def importData(datadirectory):
	#categories = ['n','u', 'y']
	categories = ['n', 'y']

	data = load_files(datadirectory,categories=categories, shuffle=True, random_state=42, encoding='latin-1') 
	X_train, X_test, y_train, y_test = cross_validation.train_test_split(data.data, data.target, test_size = 0.4, random_state=0)
	print X_train 
	# count_vect = CountVectorizer()
	# X_train_vec = count_vect.fit_transform(X_train)
	# X_test_vec = count_vect.fit_transform(X_test)
	# clf = svm.SVC(kernel='linear', C=1).fit(X_train_vec, y_train)
	# clf.score(X_test_vec, y_test) 

	text_clf = Pipeline([('vect', TfidfVectorizer()), ('clf', MultinomialNB())])
	#print text_clf.named_steps['clf']
	print str(sum(cross_val_score(text_clf, data.data,data.target ))/3.0) + ' Tfidf NB'
	#array([ 0.62376238,  0.57      ,  0.6122449 ])
	text_clf = Pipeline([('vect', CountVectorizer()),('clf', MultinomialNB()),]) 
	print str(sum(cross_val_score(text_clf, data.data,data.target ))/3.0) + ' CountVec NB'                                         #array([ 0.56435644,  0.5       ,  0.57142857])
	clf = Pipeline([('vect', CountVectorizer()), ('svm', LinearSVC())])                        
	print str(sum(cross_val_score(clf, data.data,data.target ))/3.0) + ' CountVec SVM'
	#array([ 0.55445545,  0.48      ,  0.54081633])
	clf = Pipeline([('vect', TfidfVectorizer()), ('svm', LinearSVC())])                    
	print str(sum(cross_val_score(clf, data.data,data.target ))/3.0) + ' Tfidf SVM'
	#array([ 0.62376238,  0.57      ,  0.6122449 ])
	clf_sgdc = Pipeline([('vect', CountVectorizer()),('clf', linear_model.SGDClassifier()),])
	print str(sum(cross_val_score(clf_sgdc, data.data,data.target ))/3.0) + ' SGDC' 
Code Example #16
File: load_SRAA.py, Project: dzhuang2/active_learn
def load_SRAA(AVI_HOME='./SRAA/partition1/data', percent=1./3, rnd=2342, \
              vect=CountVectorizer(min_df=5, max_df=1.0, binary=True, ngram_range=(1, 1))):
    data = load_files(AVI_HOME, encoding="latin1", load_content=True, random_state=rnd)
    data.data = [remove_header_subject(text) for text in data.data]

    indices = ShuffleSplit(len(data.data), n_iter=1, test_size=percent, indices=True, random_state=rnd)
    for train_ind, test_ind in indices:
        data = bunch.Bunch(train=bunch.Bunch(data=[data.data[i] for i in train_ind], target=data.target[train_ind]),
                              test=bunch.Bunch(data=[data.data[i] for i in test_ind], target=data.target[test_ind]))

    X_tr = vect.fit_transform(data.train.data)
    y_tr = data.train.target

    X_te = vect.transform(data.test.data)
    y_te = data.test.target
    
    # cache the files
    pickle.dump(X_tr, open('SRAA_X_train.pickle', 'wb'))
    pickle.dump(y_tr, open('SRAA_y_train.pickle', 'wb'))
    pickle.dump(X_te, open('SRAA_X_test.pickle', 'wb'))
    pickle.dump(y_te, open('SRAA_y_test.pickle', 'wb'))
    pickle.dump(data.train.data, open('SRAA_X_train_corpus.pickle', 'wb'))
    pickle.dump(data.test.data, open('SRAA_X_test_corpus.pickle', 'wb'))
    pickle.dump(vect.get_feature_names(), open('SRAA_feature_names.pickle', 'wb'))
    
    return (X_tr, y_tr, X_te, y_te, data.train.data, data.test.data)
Code Example #17
def text_classifly_twang(dataset_dir_name, fs_method, fs_num):
    print 'Loading dataset, 80% for training, 20% for testing...'
    movie_reviews = load_files(dataset_dir_name)  
    doc_str_list_train, doc_str_list_test, doc_class_list_train, doc_class_list_test = train_test_split(movie_reviews.data, movie_reviews.target, test_size = 0.2, random_state = 0)
    
    print 'Feature selection...'
    print 'fs method:' + fs_method, 'fs num:' + str(fs_num)
    vectorizer = CountVectorizer(binary = True)   
    word_tokenizer = vectorizer.build_tokenizer()
    doc_terms_list_train = [word_tokenizer(doc_str) for doc_str in doc_str_list_train]
    term_set_fs = feature_selection.feature_selection(doc_terms_list_train, doc_class_list_train, fs_method)[:fs_num]
    
    print 'Building VSM model...'
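    # fix the vectorizer's vocabulary to the selected feature terms before vectorizing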
    term_dict = dict(zip(term_set_fs, range(len(term_set_fs))))
    vectorizer.fixed_vocabulary = True
    vectorizer.vocabulary_ = term_dict
    doc_train_vec = vectorizer.fit_transform(doc_str_list_train)
    doc_test_vec= vectorizer.transform(doc_str_list_test)
    
    clf = MultinomialNB().fit(doc_train_vec, doc_class_list_train)  # train the MultinomialNB classifier
    doc_test_predicted = clf.predict(doc_test_vec)
    
    acc = np.mean(doc_test_predicted == doc_class_list_test)  
    print 'Accuracy: ', acc
    
    return acc
Code Example #18
def test_default_load_files(test_category_dir_1, test_category_dir_2,
                            load_files_root):
    res = load_files(load_files_root)
    assert_equal(len(res.filenames), 1)
    assert_equal(len(res.target_names), 2)
    assert_equal(res.DESCR, None)
    assert_equal(res.data, [b("Hello World!\n")])
Code Example #19
File: analysis.py, Project: colinricardo28/Peepl
def testdata_stats():
    test_dataset = datasets.load_files(project_root+"/testdata",
                                     encoding='utf-8',
                                  decode_error='ignore')

    # save_thing_to_file(test_dataset, "test_dataset.txt")

    bayes = get_thing_from_file("bayes.txt")
    bayes.fit(test_dataset.data, test_dataset.target)
    predicted_nb = bayes.predict(test_dataset.data)

    print "*****BAYESIAN STATS****"
    print "average accuracy = " + \
            str(numpy.mean(predicted_nb == test_dataset.target))

    print(metrics.classification_report(test_dataset.target, predicted_nb,
    target_names=test_dataset.target_names))
    print "*****BAYESIAN CONFUSION MATRIX*****"
    print metrics.confusion_matrix(test_dataset.target, predicted_nb)

    svm = get_thing_from_file("svm.txt")
    svm.fit(test_dataset.data, test_dataset.target)
    predicted_svm = svm.predict(test_dataset.data)

    print "*****SVM STATS*****"
    print "average accuracy = " + \
            str(numpy.mean(predicted_svm == test_dataset.target))
    print(metrics.classification_report(test_dataset.target, predicted_svm,
    target_names=test_dataset.target_names))
    print "*****SVM CONFUSION MATRIX*****"
    print metrics.confusion_matrix(test_dataset.target, predicted_svm)
Code Example #20
def test_load_files_wo_load_content(
        test_category_dir_1, test_category_dir_2, load_files_root):
    res = load_files(load_files_root, load_content=False)
    assert_equal(len(res.filenames), 1)
    assert_equal(len(res.target_names), 2)
    assert_equal(res.DESCR, None)
    assert_equal(res.get('data'), None)
Code Example #21
File: nb.py, Project: maya-ramanath/OP
def runClassifiers (dataDir):
    
    data = load_files(dataDir)

    nbClassifier = Pipeline([('vect', CountVectorizer()),
                             ('tfidf', TfidfTransformer()),
                             ('classifier', MultinomialNB())])
    
    parameters = {'vect__ngram_range': [(1,1),(2,2),(3,3),(1,2),(1,3)],
                  'vect__binary': [True, False],
                  'tfidf__use_idf': [True, False],
                  'classifier__alpha': [1e-2, 1e-3]}
    
    gs = GridSearchCV(nbClassifier, parameters, n_jobs=-1, verbose=1)
    gs.fit(data.data, data.target)
    best_parameters = gs.best_estimator_.get_params()
    
    print("Best score: %0.3f" % gs.best_score_)
    for params, mean_score, scores in gs.grid_scores_:
        print("%0.3f (+/-%0.03f) for %r"
              % (mean_score, scores.std() * 2, params))
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
    print("Done")
    
    pass
Code Example #22
File: Classifier.py, Project: sherkin735/dmsapp
    def __init__(self, file_path):
        self.training_documents = load_files(container_path='./20news-bydate/20news-bydate-train',
                                       categories=CATEGORIES,
                                       decode_error='ignore',
                                       shuffle=True,
                                       encoding='utf-8',
                                       random_state=42)

        self.test_documents = load_files(container_path='./20news-bydate/20news-bydate-test',
                                       categories=CATEGORIES,
                                       decode_error='ignore',
                                       shuffle=True,
                                       encoding='utf-8',
                                       random_state=42)

        self.file_path = file_path
Code Example #23
File: sentiment.py, Project: amangarg078/TextGenius
def text_sentiment(docs_new):
   docs_new=[docs_new]
   twenty_train= load_files('./Sentiment')  #the complete data is in this directory; like comp.graphics etc
   count_vect = CountVectorizer()
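   # raw term counts first; TF and TF-IDF weightings are derived from them below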
   X_train_counts = count_vect.fit_transform(twenty_train.data)
   tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)
   X_train_tf = tf_transformer.transform(X_train_counts)
   tfidf_transformer = TfidfTransformer()
   X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

   # Fit a classifier on the training set
   #clf = MultinomialNB().fit(X_train_tfidf, twenty_train.target)
   #f = open('my_classifier.pickle', 'wb')
   #pickle.dump(clf, f)
   #f = open('my_classifier.pickle',)
   #clf = pickle.load(f)
   #f.close()
   # save the classifier
   #with open('my_sentiment.pkl', 'wb') as fid:
      #cPickle.dump(clf, fid)    

   # load it again
   with open('my_sentiment.pkl', 'rb') as fid:
      clf = cPickle.load(fid)
   X_new_counts = count_vect.transform(docs_new)
   X_new_tfidf = tfidf_transformer.transform(X_new_counts)

   predicted = clf.predict(X_new_tfidf)
   return twenty_train.target_names[predicted]
Code Example #24
File: dataset.py, Project: wkentaro/apc-od
def get_inbin_depth(which_set):
    if which_set not in ('train', 'test'):
        raise ValueError

    data_dir = os.path.join(here, '../data/inbin_depth_{0}'.format(which_set))
    data = load_files(data_dir, load_content=False, shuffle=False)
    return data
Code Example #25
    def train(self):
        """Loading and Training classifier"""
        # Load dataset
        categories = ['neg', 'pos']
        self.train_set = load_files('resources/sentimentDataset/train/', categories=categories, encoding='latin-1')
        self.test_set = load_files('resources/sentimentDataset/test/', categories=categories, encoding='latin-1')

        #Tokenizing text with scikit-learn
        count_vect = CountVectorizer()
        X_train_counts = count_vect.fit_transform(self.train_set.data)

        # occurrences to frequencies
        tfidf_transformer = TfidfTransformer()
        X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

        # Pipeline
        self.text_clf = Pipeline([('vect', CountVectorizer()),('tfidf', TfidfTransformer()),('clf', MultinomialNB()),])
        self.text_clf.fit(self.train_set.data, self.train_set.target)
Code Example #26
def test_load_files_w_categories_desc_and_encoding(
        test_category_dir_1, test_category_dir_2, load_files_root):
    category = os.path.abspath(test_category_dir_1).split('/').pop()
    res = load_files(load_files_root, description="test",
                     categories=category, encoding="utf-8")
    assert_equal(len(res.filenames), 1)
    assert_equal(len(res.target_names), 1)
    assert_equal(res.DESCR, "test")
    assert_equal(res.data, [u("Hello World!\n")])
Code Example #27
def load_data(data_path, data_categories):
    return load_files(container_path=data_path,
                      description=None,
                      categories=data_categories,
                      load_content=True,
                      shuffle=True,
                      encoding='latin-1',
                      decode_error='strict',
                      random_state=randint(0, 999999))
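Note: a hedged usage sketch (not part of the original snippet; the './reviews' path and category names are placeholders) illustrating the Bunch fields that load_files returns:

# Hypothetical call; point it at a folder with one sub-directory per category.
bunch = load_data('./reviews', ['neg', 'pos'])
print(len(bunch.data))       # raw document contents, one string per file
print(bunch.target_names)    # category names taken from the sub-directory names
print(bunch.target[:10])     # integer labels aligned with bunch.data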
Code Example #28
File: test_base.py, Project: TaihuaLi/scikit-learn
def test_default_load_files():
    try:
        setup_load_files()
        res = load_files(LOAD_FILES_ROOT)
        assert_equal(len(res.filenames), 1)
        assert_equal(len(res.target_names), 2)
        assert_equal(res.DESCR, None)
        assert_equal(res.data, [b("Hello World!\n")])
    finally:
        teardown_load_files()
Code Example #29
File: test_base.py, Project: TaihuaLi/scikit-learn
def test_load_files_wo_load_content():
    try:
        setup_load_files()
        res = load_files(LOAD_FILES_ROOT, load_content=False)
        assert_equal(len(res.filenames), 1)
        assert_equal(len(res.target_names), 2)
        assert_equal(res.DESCR, None)
        assert_equal(res.get('data'), None)
    finally:
        teardown_load_files()
Code Example #30
File: API.py, Project: OdedH/AA-project
def build_histogram(path="./data", name="hist"):

    # here we create a Bunch object ['target_names', 'data', 'target', 'DESCR', 'filenames']
    raw_bunch = datasets.load_files(path, description=None, categories=None, load_content=True,
                                    shuffle=True, encoding='utf-8', decode_error='replace')
    quantities = {author: 0 for author in list(raw_bunch['target_names'])}
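    # count how many documents fall under each author/category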
    for i in list(raw_bunch['target']):
        quantities[list(raw_bunch['target_names'])[i]]+=1
    plt.figure(figsize=(17, 7), dpi=80, facecolor='w', edgecolor='k')
    plt.bar(range(len(quantities)), quantities.values(), align='center')
    plt.xticks(range(len(quantities)), quantities.keys())
    savefig(name + '.png')
Code Example #31
from sklearn.datasets import load_files
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import fbeta_score, confusion_matrix

from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier

from keras.models import Sequential, Model
from keras.layers import Dense
from keras.regularizers import l1

from keras import backend as K

import numpy as np

documents = load_files('../TEXTDATA/', shuffle=False)

# Split remainder into training and testing
X_train, X_test, y_train, y_test = train_test_split(documents.data,
                                                    documents.target,
                                                    test_size=0.15)

#import code
#code.interact(local=locals())

count_vect = CountVectorizer()
X_train_count = count_vect.fit_transform(X_train)

tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_count)
Code Example #32
def test_default_empty_load_files():
    res = load_files(LOAD_FILES_ROOT)
    assert_equal(len(res.filenames), 0)
    assert_equal(len(res.target_names), 0)
    assert_equal(res.DESCR, None)
Code Example #33
def test_load_files_wo_load_content():
    res = load_files(LOAD_FILES_ROOT, load_content=False)
    assert_equal(len(res.filenames), 1)
    assert_equal(len(res.target_names), 2)
    assert_equal(res.DESCR, None)
    assert_equal(res.get('data'), None)
Code Example #34
from nltk.corpus import stopwords
import nltk
import re
import numpy as np
from bs4 import BeautifulSoup
import pandas as pd
from sklearn.datasets import load_files
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import roc_auc_score

import seaborn as sns

Stop_words = stopwords.words('english')

dataset = load_files('Web Page Classification/')

# build our dataset
# X: documents
# y: class labels

X, y = dataset.data, dataset.target

from nltk.stem import WordNetLemmatizer

lem = WordNetLemmatizer()

corpus = []  # store the processed documents in corpus,
# but first apply preprocessing (strip whitespace and punctuation, remove stop words such as 'a', 'an')

for i in range(0, len(X)):
Code Example #35
File: resumes.py, Project: gr-b/mqp-workstation
# movie reviews

import numpy as np

from sklearn.datasets import load_files
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score

resumedir = 'data/resumes_and_reviews'
resumes = load_files(resumedir, shuffle=True)

# Split remainder into training and testing
X_train, X_test, y_train, y_test = train_test_split(
    resumes.data, resumes.target, test_size=0.50
)

count_vect = CountVectorizer()
X_train_count = count_vect.fit_transform(X_train)

tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_count)
resume_matrix = X_train_tfidf[y_train == 0]
top_words = []

svd = TruncatedSVD(n_components=20)
Code Example #36
from nltk import wordpunct_tokenize
from nltk import WordNetLemmatizer
from nltk import sent_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords as sw
from nltk.corpus import wordnet as wn
from nltk.stem import PorterStemmer
from sklearn.naive_bayes import MultinomialNB
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.datasets import load_files
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression


ps = PorterStemmer()
bookdir = r'Book'
# loading all files as training data.
book_train = load_files(bookdir, shuffle=True)
#print(book_train.data)
# target names ("classes") are automatically generated from subfolder names
#print(book_train.target_names)
#print(book_train.filenames)

#nltk.download('sentiwordnet')

stopwd  = set(sw.words('english'))

lemmatizer = WordNetLemmatizer()

def lemmatize_texte( token, tag, normalize):
    tag = {
        'N': wn.NOUN,
        'V': wn.VERB,
Code Example #37
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.cluster import MiniBatchKMeans
from sklearn.datasets import load_files
# for reproducibility
random_state = 0

print(
    "[+] NLP Clustering by https://sanjayasubedi.com.np/nlp/nlp-with-python-document-clustering/"
)

DATA_DIR = "./bbc/"
print("[+] Load files")
data = load_files(DATA_DIR,
                  encoding="utf-8",
                  decode_error="replace",
                  random_state=random_state)
df = pd.DataFrame(list(zip(data['data'], data['target'])),
                  columns=['text', 'label'])
df.head()

print("[+] Calculate tf-id")
# Learn vocabolary
vec = TfidfVectorizer(stop_words="english")
vec.fit(df.text.values)

print("[+] Define cluster")
# Create the clustering model
cls = MiniBatchKMeans(n_clusters=5, random_state=random_state)
features = vec.transform(df.text.values)
cls.fit(features)
Code Example #38
def load_dataset(path):
    data = load_files(path)
    dog_files = np.array(data['filenames'])
    dog_targets = np_utils.to_categorical(np.array(data['target']), 120)
    return dog_files, dog_targets
Code Example #39
from nltk import word_tokenize
from nltk.stem.snowball import EnglishStemmer
from sklearn.datasets import load_files
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

stemmer = EnglishStemmer()


def stemming_tokenizer(text):
    stemmed_text = [
        stemmer.stem(word) for word in word_tokenize(text, language='english')
    ]
    return stemmed_text


data_folder_training_set = "./Training"
data_folder_test_set = "./Test"

training_dataset = load_files(data_folder_training_set)
test_dataset = load_files(data_folder_test_set)

print(training_dataset.target_names)

# Load Training-Set
X_train, X_test_DUMMY_to_ignore, Y_train, Y_test_DUMMY_to_ignore = train_test_split(
    training_dataset.data, training_dataset.target, test_size=0.0)
target_names = training_dataset.target_names

# Load Test-Set
X_train_DUMMY_to_ignore, X_test, Y_train_DUMMY_to_ignore, Y_test = train_test_split(
    test_dataset.data, test_dataset.target, train_size=0.0)
target_names = training_dataset.target_names
print(Y_train.shape)
print(Y_test.shape)
Code Example #40
# Author: Olivier Grisel <*****@*****.**>
# License: Simplified BSD
# Adapted by: Francesco Mosconi

import numpy as np

from sklearn.datasets import load_files
from sklearn.cross_validation import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

# The training data folder must be passed as first argument
try:
    dataset = load_files('./wikidata/short_paragraphs')
except OSError as ex:
    print(ex)
    print(
        "Couldn't import the data, try running `python fetch_data.py` first ")
    exit(-1)

# TASK: Split the dataset in training and test set
# (use 20% of the data for test):
x_train, x_test, y_train, y_test = train_test_split(dataset.data,
                                                    dataset.target,
                                                    test_size=0.20,
                                                    random_state=0)
# TASK: Build a vectorizer that splits strings into sequences of 1 to 3
# characters instead of word tokens, using the class TfidfVectorizer
Code Example #41
#coding:utf-8
"""
Example of clustering text documents with the K-means algorithm
"""

import matplotlib.pyplot as plt
import numpy as np

# load the text data
from time import time
from sklearn.datasets import load_files
print("loading documents ...")
t = time()
docs = load_files('data/cluster_data')
print("summary: {0} documents in {1} categories.".format(
    len(docs.data), len(docs.target_names)))
print("done in {0} seconds".format(time() - t))

# vectorize the documents (TF-IDF representation)
from sklearn.feature_extraction.text import TfidfVectorizer
max_features = 20000
print("vectorizing documents ...")
t = time()
vectorizer = TfidfVectorizer(max_df=0.4,
                             min_df=2,
                             max_features=max_features,
                             encoding='latin-1')
X = vectorizer.fit_transform((d for d in docs.data))
print("n_samples: %d, n_features: %d" % X.shape)
print("number of non-zero features in sample [{0}]: {1}".format(
    docs.filenames[0], X[0].getnnz()))
Code Example #42
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.datasets import load_files
from sklearn.decomposition import TruncatedSVD

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

from sklearn import metrics

from keras.models import Sequential, Model
from keras.layers import Dense, Input
from keras.optimizers import Adam

resumes = load_files('data/actingAndManagerResumes/', shuffle=True)

# Split remainder into training and testing
X_train, X_test, y_train, y_test = train_test_split(resumes.data,
                                                    resumes.target,
                                                    test_size=0.20)

count_vect = CountVectorizer()
X_train_count = count_vect.fit_transform(X_train)

tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_count)

pca = TruncatedSVD(n_components=20)
pca.fit(X_train_tfidf)
Code Example #43
#!/usr/bin/env python
# coding: utf-8

# # Project ML / DSA : Sentiment Analysis with a Hybrid ANN-NB Classifier

# ### Read Feature (Text) and Target (Rating)

# In[1]:

from sklearn.datasets import load_files
import numpy as np

reviews = load_files("dataset", encoding="ISO-8859-1")
texts, rating = reviews.data, reviews.target

# ### Preprocessing

# In[2]:

import nltk
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet


def get_wordnet_pos(word):
    """Map POS tag to first character lemmatize() accepts"""
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {
        "J": wordnet.ADJ,
        "N": wordnet.NOUN,
Code Example #44
def load_dataset(path):
    data = load_files(path)
    medical_files = np.array(data['filenames'])
    medical_targets = np_utils.to_categorical(np.array(data['target']), 3)
    return medical_files, medical_targets
import numpy as np
import re
import nltk
import matplotlib.pyplot as plt
import seaborn as sns
nltk.download('stopwords')
nltk.download('wordnet')
from sklearn.datasets import load_files

import pickle
from nltk.corpus import stopwords

# In[2]:

dataset = load_files(r"C:\Users\ozgur\Desktop\inputset2", encoding="utf-8")
X = dataset.data
y = dataset.target
print(y)

# In[3]:

documents = []
from nltk.stem import WordNetLemmatizer

stemmer = WordNetLemmatizer()
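# note: despite the variable name, WordNetLemmatizer performs lemmatization, not stemming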

for sen in range(0, len(X)):
    # Remove all the special characters
    document = re.sub(r'\W', ' ', str(X[sen]))
Code Example #46
    return c_clf


# Out-of-core Training
train_path = os.path.join('datasets', 'IMDb', 'aclImdb', 'train')
train_pos = os.path.join(train_path, 'pos')
train_neg = os.path.join(train_path, 'neg')

fnames = [os.path.join(train_pos, f) for f in os.listdir(train_pos)] +\
         [os.path.join(train_neg, f) for f in os.listdir(train_neg)]
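# fnames lists the positive reviews first, which is why the first 12500 labels are set to 1 below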
y_train = np.zeros((len(fnames), ), dtype=int)
y_train[:12500] = 1
np.bincount(y_train)

sgd = SGDClassifier(loss='log', random_state=1)

sgd = batch_train(clf=sgd,
                  fnames=fnames,
                  labels=y_train)


# Testing
test_path = os.path.join('datasets', 'IMDb', 'aclImdb', 'test')
test = load_files(container_path=(test_path),
                  categories=['pos', 'neg'])
docs_test, y_test = test['data'][12500:], test['target'][12500:]

vec = HashingVectorizer(encoding='latin-1')
print('accuracy:', sgd.score(vec.transform(docs_test), y_test))
Code Example #47
import os
import shutil
from sklearn import preprocessing

from sklearn.datasets import load_files
from sklearn.feature_extraction.text import TfidfVectorizer

NUM_QUESTIONS = 3
PLOT_RESULTS = False
ACTIVE = True
DATA_FOLDER = "/home/af/Downloads/movie_review_kfold/review_polarity/activelearning"
TRAIN_FOLDER = os.path.join(DATA_FOLDER, "train")
TEST_FOLDER = os.path.join(DATA_FOLDER, "test")
UNLABELED_FOLDER = os.path.join(DATA_FOLDER, "unlabeled")
ENCODING = 'latin1'
while True:
    data_train = load_files(TRAIN_FOLDER, encoding=ENCODING)
    data_test = load_files(TEST_FOLDER, encoding=ENCODING)
    data_unlabeled = load_files(UNLABELED_FOLDER, encoding=ENCODING)

    categories = data_train.target_names

    def size_mb(docs):
        return sum(len(s.encode('utf-8')) for s in docs) / 1e6

    data_train_size_mb = size_mb(data_train.data)
    data_test_size_mb = size_mb(data_test.data)
    data_unlabeled_size_mb = size_mb(data_unlabeled.data)

    print("%d documents - %0.3fMB (training set)" %
          (len(data_train.data), data_train_size_mb))
    print("%d documents - %0.3fMB (test set)" %
Code Example #48
import sys
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Perceptron
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.datasets import load_files
from sklearn.cross_validation import train_test_split
from sklearn import metrics
from sklearn.externals import joblib

# The training data folder must be passed as first argument
languages_data_folder = '/home/janrn/Development/machinelearning/articles'
dataset = load_files(languages_data_folder)

# Split the dataset in training and test set:
docs_train, docs_test, y_train, y_test = train_test_split(dataset.data,
                                                          dataset.target,
                                                          test_size=0.5)

# TASK: Build a vectorizer
clf = Pipeline([('vect', TfidfVectorizer(max_df=0.9, min_df=2)),
                ('clf', LinearSVC())])

# fit the pipeline on training data
clf.fit(docs_train, y_train)

# fit pipeline on all the data (no test)
#clf.fit(dataset.data, dataset.target)

# get category names
# print dataset.target_names
Code Example #49
def load_dataset(path):
    data = load_files(path)
    print(data)
    part_files = np.array(sorted(glob("data/*")))
    return part_files
Code Example #50
def load_dataset(path):
    data = load_files(path)
    mushroom_files = np.array(data['filenames'])
    mushroom_targets = np_utils.to_categorical(np.array(data['target']), 10)
    return mushroom_files, mushroom_targets
Code Example #51
import copy
import codecs as cs
from sklearn.datasets import load_files
from sklearn import svm
from sklearn import metrics
from random import shuffle
from ficlearn.feature_extraction.text import BnsVectorizer

if __name__ == '__main__':
    print("-----------------------------------------------")
    print("Load corpus and vectorize with BNSVectorizer")
    print("-----------------------------------------------")
    corpus = "corpus6"
    label_names = ['relevant', 'spam']

    notices = load_files(corpus, categories=label_names, load_content=False)
    data = [
        cs.open(filename, 'r', 'UTF-8').read()
        for filename in notices.filenames
    ]
    n_samples = len(data)
    Y = notices.target

    start = int(n_samples / 10)
    step = start
    recalls = []
    precisions = []
    sizes = []
    N_SAMPLES = copy.deepcopy(n_samples)
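    # sweep over progressively larger slices of the corpus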
    for i in range(2, 10, 1):
        sliceIndex = int((i * 0.1 + 0.1) * N_SAMPLES)
Code Example #52
# Text Classification using NLP

# Importing the libraries
import numpy as np
import re
import pickle
import nltk
from nltk.corpus import stopwords
from sklearn.datasets import load_files
nltk.download('stopwords')

# Importing the dataset
reviews = load_files('class/')
X, y = reviews.data, reviews.target

# Pickling the dataset
with open('X.pickle', 'wb') as f:
    pickle.dump(X, f)

with open('y.pickle', 'wb') as f:
    pickle.dump(y, f)

# Unpickling dataset
X_in = open('X.pickle', 'rb')
y_in = open('y.pickle', 'rb')
X = pickle.load(X_in)
y = pickle.load(y_in)

# Creating the corpus
corpus = []
for i in range(0, 2000):
Code Example #53
File: array_test.py, Project: bits2018wilp/python-lab
from sklearn.datasets import load_digits
from sklearn.naive_bayes import BernoulliNB, GaussianNB, MultinomialNB
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn import datasets

datasets.load_digits()

cpath = 'D:\\kaggle\\basicshapes\\shapes\\' + 'circles'

datasets.load_files(cpath)
a = [1, 2, 3, 4, 5]
print(len(a))
print(a[1])
a[2] = 8
print(a.append(88), a)

a = [1, 2, 3, 4, 5]
b = [1, 2, 3, 4, 5, 6]

print(a == b)
print(5 in b)
Code Example #54
    # categories = [
    #     'alt.atheism',
    #     'talk.religion.misc',
    #     'comp.graphics',
    #     'sci.space',
    # ]
    # load dataset
    #print("Loading 20 newsgroups dataset for categories:")
    #print(categories if categories else "all")

    # data_train = fetch_20newsgroups(subset='train', categories=categories,
    #                                 shuffle=True, random_state=42)
    #data_train = fetch_20newsgroups(subset='train', shuffle=True, random_state=42)

    #data_test = fetch_20newsgroups(subset='test',shuffle=True, random_state=42)
    data_train = load_files('train')
    data_test = load_files('test')
    print('data loaded')

    # order of labels in `target_names` can be different from `categories`
    target_names = data_train.target_names

    #data_train_size_mb = size_mb(data_train.data)
    #data_test_size_mb = size_mb(data_test.data)

    # print("%d documents - %0.3fMB (training set)" % (
    #     len(data_train.data), data_train_size_mb))
    # print("%d documents - %0.3fMB (test set)" % (
    #     len(data_test.data), data_test_size_mb))
    #print("%d categories" % len(categories))
    #print()
Code Example #55
File: classifier.py, Project: markgyalus/444crawl
from sklearn.datasets import load_files
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn import metrics

language_data_folder = r'C:\Users\Imre\PycharmProjects\dirteszt\Webcrawl Excercise\crawl\data'
dataset = load_files(language_data_folder, encoding='utf-8', shuffle=True)

train_data, test_data, y_train, y_test = train_test_split(dataset.data, dataset.target, test_size=0.3)

vect = TfidfVectorizer(max_df=0.95, min_df=3)

my_clf = Pipeline([('vector', vect),
                   ('clf', LinearSVC(C=1000))])

my_clf.fit(train_data, y_train)

y_predicted = my_clf.predict(test_data)

# print(help(metrics.classification_report))
# print(metrics.classification_report(y_test, y_predicted, labels=range(13),
#                                     target_names=dataset.target_names))

test_sentences = ['Az iráni látogatáson lévő White tavalyi letartóztatásáról egy emigráns szervezet adott először hírt\
, ami egy kiszabadult fogolytól azt tudta meg, hogy 2018 októberében, egy Meshed városában lévő börtönben találkozott\
 vele.']

predicted = my_clf.predict(test_sentences)
Code Example #56
def load_dataset(path):
    data = load_files(path)
    files = np.array(data['filenames'])
    targets = np.array(data['target'])
    target_labels = np.array(data['target_names'])
    return files, targets, target_labels
Code Example #57
def load_dataset(path):
    data = load_files(path)
    fire_files = np.array(data['filenames'])
    fire_targets = np_utils.to_categorical(np.array(data['target']),
                                           num_classes)
    return fire_files, fire_targets
Code Example #58
def test_default_load_files():
    res = load_files(LOAD_FILES_ROOT)
    assert_equal(len(res.filenames), 1)
    assert_equal(len(res.target_names), 2)
    assert_equal(res.DESCR, None)
    assert_equal(res.data, [b("Hello World!\n")])
Code Example #59
import sys

from sklearn.model_selection import GridSearchCV
from sklearn.datasets import load_files
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn import metrics


if __name__ == "__main__":
    # NOTE: we put the following in a 'if __name__ == "__main__"' protected
    # block to be able to use a multi-core grid search that also works under
    # Windows, see: http://docs.python.org/library/multiprocessing.html#windows
    # The multiprocessing module is used as the backend of joblib.Parallel
    # that is used when n_jobs != 1 in GridSearchCV

    # the training data folder must be passed as first argument
    movie_reviews_data_folder = sys.argv[1]
    dataset = load_files(movie_reviews_data_folder, shuffle=False)
    print("n_samples: %d" % len(dataset.data))

    # split the dataset in training and test set:
    docs_train, docs_test, y_train, y_test = train_test_split(
        dataset.data, dataset.target, test_size=0.25, random_state=None)

    # TASK: Build a vectorizer / classifier pipeline that filters out tokens
    # that are too rare or too frequent
    pipeline = Pipeline([
        ('vect', TfidfVectorizer(min_df=3, max_df=0.95)),
        ('clf', LinearSVC(C=1000)),
    ])

    # TASK: Build a grid search to find out whether unigrams or bigrams are
    # more useful.
Code Example #60
import nltk
from sklearn import datasets, feature_extraction, linear_model, model_selection


def extract_features(corpus):
    '''Extract TF-IDF features from corpus'''
    count_vectorizer = feature_extraction.text.CountVectorizer(
        lowercase=True,
        tokenizer=nltk.word_tokenize,  # use the NLTK tokenizer
        stop_words='english',  # remove stop words
        min_df=1  # minimum document frequency
    )
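    # raw counts first, then TF-IDF weighting on top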
    processed_corpus = count_vectorizer.fit_transform(corpus)
    processed_corpus = feature_extraction.text.TfidfTransformer().fit_transform(
        processed_corpus)

    return processed_corpus


data_directory = 'tweets'
tweets_sent_data = datasets.load_files(data_directory, shuffle=True)
print('{} files loaded.'.format(len(tweets_sent_data.data)))
print('They contain the following classes: {}.'.format(
    tweets_sent_data.target_names))

tweets_tfidf = extract_features(tweets_sent_data.data)

X_train, X_test, y_train, y_test = model_selection.train_test_split(
    tweets_tfidf, tweets_sent_data.target, test_size=0.30, random_state=42)


model = linear_model.LogisticRegression()
model.fit(X_train, y_train)
print('Model performance: {}'.format(model.score(X_test, y_test)))

y_pred = model.predict(X_test)