def getData():
    train_data = load_files('training')
    test_data = load_files("test")
    count_Vec = TfidfVectorizer(min_df=1, decode_error="replace")
    doc_train = count_Vec.fit_transform(train_data.data)
    # note: use transform (not fit_transform) on the test set so it reuses the training vocabulary
    doc_test = count_Vec.transform(test_data.data)
    return doc_train.toarray(), train_data.target, doc_test.toarray()
def export_classifier():
    # note that this data is not in the git repo
    train_small = load_files('./training_data/')
    test_small = load_files('./test_data/')

    # Turn the text documents into vectors of word frequencies
    vectorizer = CountVectorizer(min_df=5, ngram_range=(1, 2),
                                 stop_words='english', strip_accents='ascii')
    X_train = vectorizer.fit_transform(train_small.data)
    y_train = train_small.target

    # Fit a classifier on the training set
    classifier = LogisticRegression(penalty='l2', tol=0.0001, C=1.0,
                                    fit_intercept=True, intercept_scaling=1,
                                    ).fit(X_train, y_train)
    print("Training score: {0:.1f}%".format(
        classifier.score(X_train, y_train) * 100))

    # Evaluate the classifier on the testing set
    X_test = vectorizer.transform(test_small.data)
    y_test = test_small.target
    print("Testing score: {0:.1f}%".format(
        classifier.score(X_test, y_test) * 100))

    export_pickle('LRclassifier.txt', classifier)
    export_pickle('LRvectorizer.txt', vectorizer)
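# A minimal sketch of the reverse direction, assuming export_pickle wraps the
# standard pickle module (the helper's actual implementation is not shown here):
# load the persisted vectorizer and classifier back and score a new document.
# load_pickle and predict_category are hypothetical names for illustration.
import pickle

def load_pickle(path):
    with open(path, 'rb') as f:
        return pickle.load(f)

def predict_category(text):
    vectorizer = load_pickle('LRvectorizer.txt')
    classifier = load_pickle('LRclassifier.txt')
    X = vectorizer.transform([text])  # transform only; the vectorizer is already fitted
    return classifier.predict(X)[0]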
def getData():
    train_data = load_files('dataset/train')
    test_data = load_files("dataset/test")
    count_Vec = TfidfVectorizer(min_df=1, decode_error="replace")
    doc_train = count_Vec.fit_transform(train_data.data)
    doc_test = count_Vec.transform(test_data.data)
    return doc_train.toarray(), train_data.target, doc_test.toarray(), test_data.target
def createDataSet(train_path, test_path, category, k):
    """
    Create vectorized text features.
    '0' refers to 'atheism', '1' refers to 'sports'.
    """
    train_set = datasets.load_files(train_path, categories=category, load_content=True,
                                    shuffle=True, encoding='utf-8',
                                    decode_error='ignore', random_state=0)
    count_vect = CountVectorizer(encoding='utf-8', lowercase=True, decode_error='ignore',
                                 analyzer='word', ngram_range=(2, 4), min_df=1)
    tfidf_vecter = TfidfVectorizer(max_df=0.8, stop_words='english')
    test_set = datasets.load_files(test_path, categories=category, load_content=True,
                                   shuffle=True, encoding='utf-8',
                                   decode_error='ignore', random_state=0)

    X_train_tfidf = tfidf_vecter.fit_transform(train_set.data)
    X_train_counts = count_vect.fit_transform(train_set.data)
    X_test_tfidf = tfidf_vecter.transform(test_set.data)
    X_test_counts = count_vect.transform(test_set.data)

    # Relabel targets for a one-vs-rest setup: class k becomes +1, everything else -1
    for i in range(X_train_counts.shape[0]):
        if train_set.target[i] == k:
            train_set.target[i] = 1
        else:
            train_set.target[i] = -1
    for i in range(X_test_counts.shape[0]):
        if test_set.target[i] == k:
            test_set.target[i] = 1
        else:
            test_set.target[i] = -1

    # X_train_normalize = preprocessing.normalize(X_train_counts, norm='l2')
    # print train_set.target_names
    # print train_set.target
    # print size
    # print len(train_set.target)
    # print X_train_tfidf.shape
    # print X_train_counts
    # print X_train_normalize
    return (X_train_counts, train_set.target, X_train_counts.shape,
            X_test_counts, test_set.target, X_test_counts.shape)
def load(dataset, categories):
    if dataset == 'full':
        train = load_files('aclImdb/aggregate/', categories=categories)
        return train
    elif dataset == 'split':
        train = load_files('aclImdb/train/', categories=categories)
        test = load_files('aclImdb/test/', categories=categories)
        return (train, test)
def vector_for_input_binary(train_file_path="/mnt/hgfs/temp/machine learning/train",
                            test_file_path="/mnt/hgfs/temp/machine learning/test",
                            categories=None):
    train_data = load.load_files(train_file_path, categories=categories,
                                 encoding='utf-8', decode_error='ignore')
    test_data = load.load_files(test_file_path, categories=categories,
                                encoding='utf-8', decode_error='ignore')
    vectorized = feature_extraction.CountVectorizer(min_df=1, binary=True)
    train_input = vectorized.fit_transform(train_data['data'])
    test_input = vectorized.transform(test_data['data'])
    return train_input, train_data['target'], test_input, test_data['target']
def test_grid_search_cv_on_newsgroup():
    ## load newsgroup data
    categories = [
        'alt.atheism',
        'talk.religion.misc',
        'comp.graphics',
        'sci.space',
    ]
    twenty_train_small = load_files('./data/20news-bydate-train/',
                                    categories=categories, charset='latin-1')
    twenty_test_small = load_files('./data/20news-bydate-test/',
                                   categories=categories, charset='latin-1')

    ## model pipeline using tfidf and a passive-aggressive classifier
    pipeline = Pipeline((
        ('vec', TfidfVectorizer(min_df=1, max_df=0.8, use_idf=True)),
        ('clf', PassiveAggressiveClassifier(C=1)),
    ))
    param_grid = {
        'vec__min_df': [1, 2],
        'vec__max_df': [0.8, 1.0],
        'vec__ngram_range': [(1, 1), (1, 2)],
        'vec__use_idf': [True, False]
    }
    X, y = twenty_train_small.data, twenty_train_small.target

    ## cross validation on n_iter = 5
    grid_searcher = meta_search.GridSearch()
    # persist only once
    grid_searcher.persist_cv_splits('text_classification', X, y, './tmp/')
    grid_searcher.search(pipeline, param_grid)
    import time
    while not grid_searcher.isready():
        print time.sleep(2)
        print 'progress:', grid_searcher.progress()
        print 'best result:', grid_searcher.best_params_so_far()
        if grid_searcher.best_params_so_far():
            pass  # grid_searcher.abort()
    print len(grid_searcher.partial_result())

    ## run again with naive bayes
    ## no need to persist_cv_splits
    pipeline = Pipeline((
        ('vec', TfidfVectorizer(min_df=1, max_df=0.8, use_idf=True)),
        ('clf', MultinomialNB()),
    ))
    grid_searcher10 = meta_search.GridSearch(datafiles=grid_searcher.datafiles)
    grid_searcher10.search(pipeline, param_grid)
    while not grid_searcher10.isready():
        print time.sleep(2)
        print 'progress:', grid_searcher10.progress()
        print 'best result:', grid_searcher10.best_params_so_far()
        if grid_searcher10.best_params_so_far():
            pass  # grid_searcher10.abort()
    print len(grid_searcher10.partial_result())
def main():
    # buildTrainSet()
    # buildTestSet()
    train = load_files('model/train', encoding='utf-8')
    test = load_files('model/test', encoding='utf-8')
    print train.target_names  # original printed a non-existent attribute (train.cc)
    # for l in train.target_names:
    #     print l
    # for l in train.target:
    #     print l
    vectorizer = TfidfVectorizer(sublinear_tf=True, stop_words='english')
    # fit on the training documents, then only transform the test documents
    X_train = vectorizer.fit_transform(train.data)
    X_test = vectorizer.transform(test.data)
    print vectorizer.get_feature_names()
def vector_for_input(train_file_path=path1, test_file_path=path2, categories=None):
    train_data = load.load_files(train_file_path, categories=categories,
                                 encoding='utf-8', decode_error='ignore')
    test_data = load.load_files(test_file_path, categories=categories,
                                encoding='utf-8', decode_error='ignore')
    # vectorized_normalized = feature_extraction.TfidfVectorizer(min_df=1)
    # train_input_normalized = vectorized_normalized.fit_transform(train_data['data'])
    # test_input_normalized = vectorized_normalized.transform(test_data['data'])
    vectorized = feature_extraction.CountVectorizer(min_df=1)
    train_input = vectorized.fit_transform(train_data['data'])
    test_input = vectorized.transform(test_data['data'])
    return train_input, train_data['target'], test_input, test_data['target']
def test_docs(dir):
    # Load documents
    docs = datasets.load_files(container_path="../../sklearn_data/" + dir)
    X, y = docs.data, docs.target
    baseline = 1 / float(len(list(np.unique(y))))

    # Select features via a bag-of-words approach without stop words
    # X = CountVectorizer(charset_error='ignore', stop_words='english',
    #                     strip_accents='unicode').fit_transform(X)
    X = TfidfVectorizer(charset_error='ignore', stop_words='english',
                        analyzer='char', ngram_range=(2, 4),
                        strip_accents='unicode', sublinear_tf=True,
                        max_df=0.5).fit_transform(X)
    n_samples, n_features = X.shape

    # sklearn's grid search
    parameters = {'alpha': np.logspace(-100, 0, 10)}
    bv = Bootstrap(n_samples, n_iter=10, test_size=0.3, random_state=42)
    mnb_gv = GridSearchCV(MultinomialNB(), parameters, cv=bv)
    # scores = cross_val_score(mnb_gv, X, y, cv=bv)
    mnb_gv.fit(X, y)
    mnb_gv_best_params = mnb_gv.best_params_.values()[0]
    print mnb_gv.best_score_
    print mnb_gv_best_params

    # CV with Bootstrap
    mnb = MultinomialNB(alpha=mnb_gv_best_params)
    boot_scores = cross_val_score(mnb, X, y, cv=bv)
    print mean_sem(boot_scores)

    improvement = (mnb_gv.best_score_ - baseline) / baseline
    rand_baseline.append(baseline)
    test_results.append([mnb_gv.best_score_])
    com_results.append(improvement)
    sem_results.append(sem(boot_scores))
def train(param_search=False):
    data = load_files(download())
    y = [data.target_names[t] for t in data.target]
    # The random state on the LR estimator is fixed to the most arbitrary value
    # that I could come up with. It is biased toward the middle number keys on
    # my keyboard.
    clf = make_pipeline(TfidfVectorizer(min_df=2, dtype=float, sublinear_tf=True,
                                        ngram_range=(1, 2), strip_accents='unicode'),
                        LogisticRegression(random_state=623, C=5000))

    if param_search:
        # make_pipeline auto-names steps after their classes, so the grid keys
        # must use the 'tfidfvectorizer' / 'logisticregression' prefixes.
        params = {'tfidfvectorizer__ngram_range': [(1, 1), (1, 2)],
                  'logisticregression__C': [1000, 5000, 10000]}

        print("Starting parameter search for review sentiment classification")
        # We ignore the original folds in the data, preferring a simple 5-fold
        # CV instead; this is intended to get a working model, not results for
        # publication.
        gs = GridSearchCV(clf, params, cv=5, refit=True, n_jobs=-1, verbose=2)
        gs.fit(data.data, y)

        print("Parameters found:")
        pprint(gs.best_params_)
        print("Cross-validation accuracy: %.3f" % gs.best_score_)
        return gs.best_estimator_
    else:
        print("Training logistic regression for movie review polarity")
        return clf.fit(data.data, y)
def load_data():
    # Download the data and unpack it into the ./data/txt_sentoken folder
    # ("http://www.cs.cornell.edu/people/pabo/movie-review-data/review_polarity.tar.gz")
    dataset = load_files('./data/txt_sentoken', shuffle=False)
    print("n_samples: %d" % len(dataset.data))
    return dataset
def test_load_files_w_categories_desc_and_encoding():
    category = os.path.abspath(TEST_CATEGORY_DIR1).split("/").pop()
    res = load_files(LOAD_FILES_ROOT, description="test",
                     categories=category, encoding="utf-8")
    assert_equal(len(res.filenames), 1)
    assert_equal(len(res.target_names), 1)
    assert_equal(res.DESCR, "test")
    assert_equal(res.data, [u("Hello World!\n")])
def load_docs(path):
    dataset = load_files(args.train_path)
    docs = []
    for raw_data in dataset.data:
        docs.append(json.loads(raw_data))
    dataset.data = docs
    return dataset
def importData(datadirectory):
    # categories = ['n', 'u', 'y']
    categories = ['n', 'y']
    data = load_files(datadirectory, categories=categories, shuffle=True,
                      random_state=42, encoding='latin-1')
    X_train, X_test, y_train, y_test = cross_validation.train_test_split(
        data.data, data.target, test_size=0.4, random_state=0)
    print X_train

    # count_vect = CountVectorizer()
    # X_train_vec = count_vect.fit_transform(X_train)
    # X_test_vec = count_vect.fit_transform(X_test)
    # clf = svm.SVC(kernel='linear', C=1).fit(X_train_vec, y_train)
    # clf.score(X_test_vec, y_test)

    text_clf = Pipeline([('vect', TfidfVectorizer()), ('clf', MultinomialNB())])
    # print text_clf.named_steps['clf']
    print str(sum(cross_val_score(text_clf, data.data, data.target)) / 3.0) + ' Tfidf NB'
    # array([ 0.62376238,  0.57      ,  0.6122449 ])

    text_clf = Pipeline([('vect', CountVectorizer()), ('clf', MultinomialNB())])
    print str(sum(cross_val_score(text_clf, data.data, data.target)) / 3.0) + ' CountVec NB'
    # array([ 0.56435644,  0.5       ,  0.57142857])

    clf = Pipeline([('vect', CountVectorizer()), ('svm', LinearSVC())])
    print str(sum(cross_val_score(clf, data.data, data.target)) / 3.0) + ' CountVec SVM'
    # array([ 0.55445545,  0.48      ,  0.54081633])

    clf = Pipeline([('vect', TfidfVectorizer()), ('svm', LinearSVC())])
    print str(sum(cross_val_score(clf, data.data, data.target)) / 3.0) + ' Tfidf SVM'
    # array([ 0.62376238,  0.57      ,  0.6122449 ])

    clf_sgdc = Pipeline([('vect', CountVectorizer()), ('clf', linear_model.SGDClassifier())])
    print str(sum(cross_val_score(clf_sgdc, data.data, data.target)) / 3.0) + ' SGDC'
def load_SRAA(AVI_HOME='./SRAA/partition1/data', percent=1./3, rnd=2342,
              vect=CountVectorizer(min_df=5, max_df=1.0, binary=True, ngram_range=(1, 1))):
    data = load_files(AVI_HOME, encoding="latin1", load_content=True, random_state=rnd)
    data.data = [remove_header_subject(text) for text in data.data]

    indices = ShuffleSplit(len(data.data), n_iter=1, test_size=percent,
                           indices=True, random_state=rnd)
    for train_ind, test_ind in indices:
        data = bunch.Bunch(train=bunch.Bunch(data=[data.data[i] for i in train_ind],
                                             target=data.target[train_ind]),
                           test=bunch.Bunch(data=[data.data[i] for i in test_ind],
                                            target=data.target[test_ind]))

    X_tr = vect.fit_transform(data.train.data)
    y_tr = data.train.target
    X_te = vect.transform(data.test.data)
    y_te = data.test.target

    # cache the files
    pickle.dump(X_tr, open('SRAA_X_train.pickle', 'wb'))
    pickle.dump(y_tr, open('SRAA_y_train.pickle', 'wb'))
    pickle.dump(X_te, open('SRAA_X_test.pickle', 'wb'))
    pickle.dump(y_te, open('SRAA_y_test.pickle', 'wb'))
    pickle.dump(data.train.data, open('SRAA_X_train_corpus.pickle', 'wb'))
    pickle.dump(data.test.data, open('SRAA_X_test_corpus.pickle', 'wb'))
    pickle.dump(vect.get_feature_names(), open('SRAA_feature_names.pickle', 'wb'))

    return (X_tr, y_tr, X_te, y_te, data.train.data, data.test.data)
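# A small companion sketch (not part of the original code): reload the cached
# SRAA pickles written above so the expensive load_files/vectorize step can be
# skipped on later runs. File names mirror the ones used in load_SRAA; the
# function name load_SRAA_from_cache is hypothetical.
import pickle

def load_SRAA_from_cache():
    with open('SRAA_X_train.pickle', 'rb') as f:
        X_tr = pickle.load(f)
    with open('SRAA_y_train.pickle', 'rb') as f:
        y_tr = pickle.load(f)
    with open('SRAA_X_test.pickle', 'rb') as f:
        X_te = pickle.load(f)
    with open('SRAA_y_test.pickle', 'rb') as f:
        y_te = pickle.load(f)
    return X_tr, y_tr, X_te, y_te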
def text_classifly_twang(dataset_dir_name, fs_method, fs_num):
    print 'Loading dataset, 80% for training, 20% for testing...'
    movie_reviews = load_files(dataset_dir_name)
    doc_str_list_train, doc_str_list_test, doc_class_list_train, doc_class_list_test = \
        train_test_split(movie_reviews.data, movie_reviews.target,
                         test_size=0.2, random_state=0)

    print 'Feature selection...'
    print 'fs method:' + fs_method, 'fs num:' + str(fs_num)
    vectorizer = CountVectorizer(binary=True)
    word_tokenizer = vectorizer.build_tokenizer()
    doc_terms_list_train = [word_tokenizer(doc_str) for doc_str in doc_str_list_train]
    term_set_fs = feature_selection.feature_selection(doc_terms_list_train,
                                                      doc_class_list_train,
                                                      fs_method)[:fs_num]

    print 'Building VSM model...'
    term_dict = dict(zip(term_set_fs, range(len(term_set_fs))))
    vectorizer.fixed_vocabulary = True
    vectorizer.vocabulary_ = term_dict
    doc_train_vec = vectorizer.fit_transform(doc_str_list_train)
    doc_test_vec = vectorizer.transform(doc_str_list_test)

    clf = MultinomialNB().fit(doc_train_vec, doc_class_list_train)  # train the MultinomialNB classifier
    doc_test_predicted = clf.predict(doc_test_vec)

    acc = np.mean(doc_test_predicted == doc_class_list_test)
    print 'Accuracy: ', acc

    return acc
def test_default_load_files(test_category_dir_1, test_category_dir_2, load_files_root):
    res = load_files(load_files_root)
    assert_equal(len(res.filenames), 1)
    assert_equal(len(res.target_names), 2)
    assert_equal(res.DESCR, None)
    assert_equal(res.data, [b("Hello World!\n")])
def testdata_stats():
    test_dataset = datasets.load_files(project_root + "/testdata",
                                       encoding='utf-8', decode_error='ignore')
    # save_thing_to_file(test_dataset, "test_dataset.txt")

    bayes = get_thing_from_file("bayes.txt")
    bayes.fit(test_dataset.data, test_dataset.target)
    predicted_nb = bayes.predict(test_dataset.data)

    print "*****BAYESIAN STATS****"
    print "average accuracy = " + \
        str(numpy.mean(predicted_nb == test_dataset.target))
    print(metrics.classification_report(test_dataset.target, predicted_nb,
                                        target_names=test_dataset.target_names))
    print "*****BAYESIAN CONFUSION MATRIX*****"
    print metrics.confusion_matrix(test_dataset.target, predicted_nb)

    svm = get_thing_from_file("svm.txt")
    svm.fit(test_dataset.data, test_dataset.target)
    predicted_svm = svm.predict(test_dataset.data)

    print "*****SVM STATS*****"
    print "average accuracy = " + \
        str(numpy.mean(predicted_svm == test_dataset.target))
    print(metrics.classification_report(test_dataset.target, predicted_svm,
                                        target_names=test_dataset.target_names))
    print "*****SVM CONFUSION MATRIX*****"
    print metrics.confusion_matrix(test_dataset.target, predicted_svm)
def test_load_files_wo_load_content(
        test_category_dir_1, test_category_dir_2, load_files_root):
    res = load_files(load_files_root, load_content=False)
    assert_equal(len(res.filenames), 1)
    assert_equal(len(res.target_names), 2)
    assert_equal(res.DESCR, None)
    assert_equal(res.get('data'), None)
def runClassifiers(dataDir):
    data = load_files(dataDir)
    nbClassifier = Pipeline([('vect', CountVectorizer()),
                             ('tfidf', TfidfTransformer()),
                             ('classifier', MultinomialNB())])
    parameters = {'vect__ngram_range': [(1, 1), (2, 2), (3, 3), (1, 2), (1, 3)],
                  'vect__binary': [True, False],
                  'tfidf__use_idf': [True, False],
                  'classifier__alpha': [1e-2, 1e-3]}
    gs = GridSearchCV(nbClassifier, parameters, n_jobs=-1, verbose=1)
    gs.fit(data.data, data.target)

    best_parameters = gs.best_estimator_.get_params()
    print("Best score: %0.3f" % gs.best_score_)
    for params, mean_score, scores in gs.grid_scores_:
        print("%0.3f (+/-%0.03f) for %r" % (mean_score, scores.std() * 2, params))
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
    print("Done")
def __init__(self, file_path):
    self.training_documents = load_files(container_path='./20news-bydate/20news-bydate-train',
                                         categories=CATEGORIES, decode_error='ignore',
                                         shuffle=True, encoding='utf-8', random_state=42)
    self.test_documents = load_files(container_path='./20news-bydate/20news-bydate-test',
                                     categories=CATEGORIES, decode_error='ignore',
                                     shuffle=True, encoding='utf-8', random_state=42)
    self.file_path = file_path
def text_sentiment(docs_new):
    docs_new = [docs_new]
    twenty_train = load_files('./Sentiment')  # the complete data is in this directory, like comp.graphics etc.
    count_vect = CountVectorizer()
    X_train_counts = count_vect.fit_transform(twenty_train.data)

    tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)
    X_train_tf = tf_transformer.transform(X_train_counts)
    tfidf_transformer = TfidfTransformer()
    X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

    # Fit a classifier on the training set
    # clf = MultinomialNB().fit(X_train_tfidf, twenty_train.target)
    # f = open('my_classifier.pickle', 'wb')
    # pickle.dump(clf, f)
    # f = open('my_classifier.pickle')
    # clf = pickle.load(f)
    # f.close()

    # save the classifier
    # with open('my_sentiment.pkl', 'wb') as fid:
    #     cPickle.dump(clf, fid)

    # load it again
    with open('my_sentiment.pkl', 'rb') as fid:
        clf = cPickle.load(fid)

    X_new_counts = count_vect.transform(docs_new)
    X_new_tfidf = tfidf_transformer.transform(X_new_counts)
    predicted = clf.predict(X_new_tfidf)
    return twenty_train.target_names[predicted[0]]
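# Side note (a sketch, not the original author's code): the function above refits
# CountVectorizer on the training corpus on every call while only the classifier
# is unpickled. Persisting the fitted vectorizer and classifier together, e.g. as
# a Pipeline, avoids the repeated fit. The file name my_sentiment_pipeline.pkl
# and the helper name export_sentiment_pipeline are hypothetical.
import pickle
from sklearn.datasets import load_files
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB

def export_sentiment_pipeline(train_dir='./Sentiment'):
    train = load_files(train_dir)
    pipe = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', MultinomialNB())])
    pipe.fit(train.data, train.target)
    with open('my_sentiment_pipeline.pkl', 'wb') as fid:
        pickle.dump(pipe, fid)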
def get_inbin_depth(which_set):
    if which_set not in ('train', 'test'):
        raise ValueError
    data_dir = os.path.join(here, '../data/inbin_depth_{0}'.format(which_set))
    data = load_files(data_dir, load_content=False, shuffle=False)
    return data
def train(self):
    """Load the data and train the classifier."""
    # Load dataset
    categories = ['neg', 'pos']
    self.train_set = load_files('resources/sentimentDataset/train/',
                                categories=categories, encoding='latin-1')
    self.test_set = load_files('resources/sentimentDataset/test/',
                               categories=categories, encoding='latin-1')

    # Tokenizing text with scikit-learn
    count_vect = CountVectorizer()
    X_train_counts = count_vect.fit_transform(self.train_set.data)

    # Occurrences to frequencies
    tfidf_transformer = TfidfTransformer()
    X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

    # Pipeline
    self.text_clf = Pipeline([('vect', CountVectorizer()),
                              ('tfidf', TfidfTransformer()),
                              ('clf', MultinomialNB())])
    self.text_clf.fit(self.train_set.data, self.train_set.target)
def test_load_files_w_categories_desc_and_encoding(
        test_category_dir_1, test_category_dir_2, load_files_root):
    category = os.path.abspath(test_category_dir_1).split('/').pop()
    res = load_files(load_files_root, description="test",
                     categories=category, encoding="utf-8")
    assert_equal(len(res.filenames), 1)
    assert_equal(len(res.target_names), 1)
    assert_equal(res.DESCR, "test")
    assert_equal(res.data, [u("Hello World!\n")])
def load_data(data_path, data_categories):
    return load_files(container_path=data_path, description=None,
                      categories=data_categories, load_content=True,
                      shuffle=True, encoding='latin-1',
                      decode_error='strict', random_state=randint(0, 999999))
def test_default_load_files():
    try:
        setup_load_files()
        res = load_files(LOAD_FILES_ROOT)
        assert_equal(len(res.filenames), 1)
        assert_equal(len(res.target_names), 2)
        assert_equal(res.DESCR, None)
        assert_equal(res.data, [b("Hello World!\n")])
    finally:
        teardown_load_files()
def test_load_files_wo_load_content():
    try:
        setup_load_files()
        res = load_files(LOAD_FILES_ROOT, load_content=False)
        assert_equal(len(res.filenames), 1)
        assert_equal(len(res.target_names), 2)
        assert_equal(res.DESCR, None)
        assert_equal(res.get('data'), None)
    finally:
        teardown_load_files()
def build_histogram(path="./data", name="hist"):
    # load_files returns a Bunch object with the keys
    # ['target_names', 'data', 'target', 'DESCR', 'filenames']
    raw_bunch = datasets.load_files(path, description=None, categories=None,
                                    load_content=True, shuffle=True,
                                    encoding='utf-8', decode_error='replace')
    quantities = {author: 0 for author in list(raw_bunch['target_names'])}
    for i in list(raw_bunch['target']):
        quantities[list(raw_bunch['target_names'])[i]] += 1

    plt.figure(figsize=(17, 7), dpi=80, facecolor='w', edgecolor='k')
    plt.bar(range(len(quantities)), quantities.values(), align='center')
    plt.xticks(range(len(quantities)), quantities.keys())
    savefig(name + '.png')
from sklearn.datasets import load_files
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import fbeta_score, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from keras.models import Sequential, Model
from keras.layers import Dense
from keras.regularizers import l1
from keras import backend as K
import numpy as np

documents = load_files('../TEXTDATA/', shuffle=False)

# Split remainder into training and testing
X_train, X_test, y_train, y_test = train_test_split(documents.data,
                                                     documents.target,
                                                     test_size=0.15)
# import code
# code.interact(local=locals())

count_vect = CountVectorizer()
X_train_count = count_vect.fit_transform(X_train)

tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_count)
def test_default_empty_load_files():
    res = load_files(LOAD_FILES_ROOT)
    assert_equal(len(res.filenames), 0)
    assert_equal(len(res.target_names), 0)
    assert_equal(res.DESCR, None)
def test_load_files_wo_load_content():
    res = load_files(LOAD_FILES_ROOT, load_content=False)
    assert_equal(len(res.filenames), 1)
    assert_equal(len(res.target_names), 2)
    assert_equal(res.DESCR, None)
    assert_equal(res.get('data'), None)
from nltk.corpus import stopwords
import nltk
import re
import numpy as np
from bs4 import BeautifulSoup
import pandas as pd
from sklearn.datasets import load_files
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import roc_auc_score
import seaborn as sns

Stop_words = stopwords.words('english')

dataset = load_files('Web Page Classification/')  # build the dataset
# X: documents
# y: classes
X, y = dataset.data, dataset.target

from nltk.stem import WordNetLemmatizer
lem = WordNetLemmatizer()

corpus = []
# Store the documents in the corpus,
# but first do some preprocessing (strip special characters, punctuation, articles such as "a"/"an")
for i in range(0, len(X)):
# movie reviews
import numpy as np
from sklearn.datasets import load_files
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score

resumedir = 'data/resumes_and_reviews'
resumes = load_files(resumedir, shuffle=True)

# Split remainder into training and testing
X_train, X_test, y_train, y_test = train_test_split(
    resumes.data, resumes.target, test_size=0.50
)

count_vect = CountVectorizer()
X_train_count = count_vect.fit_transform(X_train)

tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_count)

resume_matrix = X_train_tfidf[y_train == 0]
top_words = []
svd = TruncatedSVD(n_components=20)
from nltk import wordpunct_tokenize
from nltk import WordNetLemmatizer
from nltk import sent_tokenize
from nltk import pos_tag
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords as sw
from nltk.corpus import wordnet as wn
from sklearn.datasets import load_files
from sklearn.naive_bayes import MultinomialNB
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

ps = PorterStemmer()

bookdir = r'Book'
# loading all files as training data.
book_train = load_files(bookdir, shuffle=True)
# print(book_train.data)

# target names ("classes") are automatically generated from subfolder names
# print(book_train.target_names)
# print(book_train.filenames)
# nltk.download('sentiwordnet')

stopwd = set(sw.words('english'))
lemmatizer = WordNetLemmatizer()

def lemmatize_texte(token, tag, normalize):
    tag = {
        'N': wn.NOUN,
        'V': wn.VERB,
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.cluster import MiniBatchKMeans
import matplotlib.pyplot as plt
from sklearn.datasets import load_files

# for reproducibility
random_state = 0

print(
    "[+] NLP Clustering by https://sanjayasubedi.com.np/nlp/nlp-with-python-document-clustering/"
)

DATA_DIR = "./bbc/"
print("[+] Load files")
data = load_files(DATA_DIR, encoding="utf-8", decode_error="replace",
                  random_state=random_state)
df = pd.DataFrame(list(zip(data['data'], data['target'])),
                  columns=['text', 'label'])
df.head()

print("[+] Calculate tf-idf")
# Learn the vocabulary
vec = TfidfVectorizer(stop_words="english")
vec.fit(df.text.values)

print("[+] Define cluster")
# Create the model
cls = MiniBatchKMeans(n_clusters=5, random_state=random_state)
features = vec.transform(df.text.values)
cls.fit(features)
def load_dataset(path):
    data = load_files(path)
    dog_files = np.array(data['filenames'])
    dog_targets = np_utils.to_categorical(np.array(data['target']), 120)
    return dog_files, dog_targets
from sklearn.neighbors import KNeighborsClassifier

stemmer = EnglishStemmer()

def stemming_tokenizer(text):
    stemmed_text = [
        stemmer.stem(word)
        for word in word_tokenize(text, language='english')
    ]
    return stemmed_text

data_folder_training_set = "./Training"
data_folder_test_set = "./Test"

training_dataset = load_files(data_folder_training_set)
test_dataset = load_files(data_folder_test_set)
print(training_dataset.target_names)

# Load Training-Set
X_train, X_test_DUMMY_to_ignore, Y_train, Y_test_DUMMY_to_ignore = train_test_split(
    training_dataset.data, training_dataset.target, test_size=0.0)
target_names = training_dataset.target_names

# Load Test-Set
X_train_DUMMY_to_ignore, X_test, Y_train_DUMMY_to_ignore, Y_test = train_test_split(
    test_dataset.data, test_dataset.target, train_size=0.0)
target_names = training_dataset.target_names

print(Y_train.shape)
print(Y_test.shape)
# Author: Olivier Grisel <*****@*****.**>
# License: Simplified BSD
# Adapted by: Francesco Mosconi

import numpy as np
from sklearn.datasets import load_files
from sklearn.cross_validation import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

# The training data folder must be passed as first argument
try:
    dataset = load_files('./wikidata/short_paragraphs')
except OSError as ex:
    print(ex)
    print("Couldn't import the data, try running `python fetch_data.py` first")
    exit(-1)

# TASK: Split the dataset in training and test set
# (use 20% of the data for test):
x_train, x_test, y_train, y_test = train_test_split(dataset.data,
                                                    dataset.target,
                                                    test_size=0.20,
                                                    random_state=0)

# TASK: Build a vectorizer that splits strings into sequences of 1 to 3
# characters instead of word tokens, using the class TfidfVectorizer
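# One way to carry out the TASK above (a sketch, not necessarily the exercise's
# reference solution): character n-grams of length 1 to 3 with TfidfVectorizer.
vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(1, 3), use_idf=True)
# Typical use: fit on the training paragraphs, then only transform the test split.
# X_train_tfidf = vectorizer.fit_transform(x_train)
# X_test_tfidf = vectorizer.transform(x_test)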
# coding:utf-8
"""
Example: clustering text documents with the K-means algorithm
"""
import matplotlib.pyplot as plt
import numpy as np

# Load the text data
from time import time
from sklearn.datasets import load_files

print("loading documents ...")
t = time()
docs = load_files('data/cluster_data')
print("summary: {0} documents in {1} categories.".format(
    len(docs.data), len(docs.target_names)))
print("done in {0} seconds".format(time() - t))

# Vectorize the documents
from sklearn.feature_extraction.text import TfidfVectorizer

max_features = 20000
print("vectorizing documents ...")
t = time()
vectorizer = TfidfVectorizer(max_df=0.4, min_df=2,
                             max_features=max_features,
                             encoding='latin-1')
X = vectorizer.fit_transform((d for d in docs.data))
print("n_samples: %d, n_features: %d" % X.shape)
print("number of non-zero features in sample [{0}]: {1}".format(
    docs.filenames[0], X[0].getnnz()))
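# The docstring above promises K-means clustering, but the excerpt stops after
# vectorization. A minimal continuation sketch (the parameter values here are
# illustrative assumptions, not the original author's choices):
from sklearn.cluster import KMeans

n_clusters = len(docs.target_names)
print("clustering documents ...")
t = time()
kmeans = KMeans(n_clusters=n_clusters, max_iter=100, n_init=3)
kmeans.fit(X)
print("done in {0} seconds".format(time() - t))
print("cluster sizes:", np.bincount(kmeans.labels_))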
from sklearn.datasets import load_files
from sklearn.decomposition import TruncatedSVD
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn import metrics
from keras.models import Sequential, Model
from keras.layers import Dense, Input
from keras.optimizers import Adam

resumes = load_files('data/actingAndManagerResumes/', shuffle=True)

# Split remainder into training and testing
X_train, X_test, y_train, y_test = train_test_split(resumes.data,
                                                     resumes.target,
                                                     test_size=0.20)

count_vect = CountVectorizer()
X_train_count = count_vect.fit_transform(X_train)

tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_count)

pca = TruncatedSVD(n_components=20)
pca.fit(X_train_tfidf)
#!/usr/bin/env python
# coding: utf-8

# # ML / DSA Project: Sentiment Analysis with a Hybrid ANN-NB Classifier

# ### Read Feature (Text) and Target (Rating)

# In[1]:

from sklearn.datasets import load_files
import numpy as np

reviews = load_files("dataset", encoding="ISO-8859-1")
texts, rating = reviews.data, reviews.target

# ### Preprocessing

# In[2]:

import nltk
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

def get_wordnet_pos(word):
    """Map POS tag to first character lemmatize() accepts"""
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {
        "J": wordnet.ADJ,
        "N": wordnet.NOUN,
def load_dataset(path):
    data = load_files(path)
    medical_files = np.array(data['filenames'])
    medical_targets = np_utils.to_categorical(np.array(data['target']), 3)
    return medical_files, medical_targets
import numpy as np
import re
import nltk
import matplotlib.pyplot as plt
import seaborn as sns

nltk.download('stopwords')
nltk.download('wordnet')

from sklearn.datasets import load_files
import pickle
from nltk.corpus import stopwords

# In[2]:

dataset = load_files(r"C:\Users\ozgur\Desktop\inputset2", encoding="utf-8")
X = dataset.data
y = dataset.target
print(y)

# In[3]:

documents = []
from nltk.stem import WordNetLemmatizer

stemmer = WordNetLemmatizer()

for sen in range(0, len(X)):
    # Remove all the special characters
    document = re.sub(r'\W', ' ', str(X[sen]))
    return c_clf

# Out-of-core Training
train_path = os.path.join('datasets', 'IMDb', 'aclImdb', 'train')
train_pos = os.path.join(train_path, 'pos')
train_neg = os.path.join(train_path, 'neg')

fnames = [os.path.join(train_pos, f) for f in os.listdir(train_pos)] + \
         [os.path.join(train_neg, f) for f in os.listdir(train_neg)]
y_train = np.zeros((len(fnames), ), dtype=int)
y_train[:12500] = 1
np.bincount(y_train)

sgd = SGDClassifier(loss='log', random_state=1)
sgd = batch_train(clf=sgd, fnames=fnames, labels=y_train)

# Testing
test_path = os.path.join('datasets', 'IMDb', 'aclImdb', 'test')
test = load_files(container_path=test_path, categories=['pos', 'neg'])
docs_test, y_test = test['data'][12500:], test['target'][12500:]

vec = HashingVectorizer(encoding='latin-1')
print('accuracy:', sgd.score(vec.transform(docs_test), y_test))
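# The snippet above relies on a batch_train helper that is not shown in this
# excerpt. A rough sketch of what such a helper might look like, assuming it
# streams the files through HashingVectorizer and SGDClassifier.partial_fit in
# minibatches (the batch size and shuffling are assumptions):
import numpy as np
from sklearn.feature_extraction.text import HashingVectorizer

def batch_train(clf, fnames, labels, batch_size=1000, random_state=1):
    vec = HashingVectorizer(encoding='latin-1')
    idx = np.arange(len(fnames))
    np.random.RandomState(random_state).shuffle(idx)
    for start in range(0, len(idx), batch_size):
        batch = idx[start:start + batch_size]
        docs = [open(fnames[i], encoding='latin-1').read() for i in batch]
        X = vec.transform(docs)
        # partial_fit updates the linear model one minibatch at a time
        clf.partial_fit(X, labels[batch], classes=np.array([0, 1]))
    return clf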
import os
import shutil
from sklearn import preprocessing
from sklearn.datasets import load_files
from sklearn.feature_extraction.text import TfidfVectorizer

NUM_QUESTIONS = 3
PLOT_RESULTS = False
ACTIVE = True
DATA_FOLDER = "/home/af/Downloads/movie_review_kfold/review_polarity/activelearning"
TRAIN_FOLDER = os.path.join(DATA_FOLDER, "train")
TEST_FOLDER = os.path.join(DATA_FOLDER, "test")
UNLABELED_FOLDER = os.path.join(DATA_FOLDER, "unlabeled")
ENCODING = 'latin1'

while True:
    data_train = load_files(TRAIN_FOLDER, encoding=ENCODING)
    data_test = load_files(TEST_FOLDER, encoding=ENCODING)
    data_unlabeled = load_files(UNLABELED_FOLDER, encoding=ENCODING)
    categories = data_train.target_names

    def size_mb(docs):
        return sum(len(s.encode('utf-8')) for s in docs) / 1e6

    data_train_size_mb = size_mb(data_train.data)
    data_test_size_mb = size_mb(data_test.data)
    data_unlabeled_size_mb = size_mb(data_unlabeled.data)

    print("%d documents - %0.3fMB (training set)" %
          (len(data_train.data), data_train_size_mb))
    print("%d documents - %0.3fMB (test set)" %
import sys

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Perceptron
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.datasets import load_files
from sklearn.cross_validation import train_test_split
from sklearn import metrics
from sklearn.externals import joblib

# The training data folder must be passed as first argument
languages_data_folder = '/home/janrn/Development/machinelearning/articles'
dataset = load_files(languages_data_folder)

# Split the dataset in training and test set:
docs_train, docs_test, y_train, y_test = train_test_split(dataset.data,
                                                          dataset.target,
                                                          test_size=0.5)

# TASK: Build a vectorizer
clf = Pipeline([('vect', TfidfVectorizer(max_df=0.9, min_df=2)),
                ('clf', LinearSVC())])

# fit the pipeline on training data
clf.fit(docs_train, y_train)

# fit pipeline on all the data (no test)
# clf.fit(dataset.data, dataset.target)

# get category names
# print dataset.target_names
def load_dataset(path):
    data = load_files(path)
    print(data)
    part_files = np.array(sorted(glob("data/*")))
    return part_files
def load_dataset(path):
    data = load_files(path)
    mushroom_files = np.array(data['filenames'])
    mushroom_targets = np_utils.to_categorical(np.array(data['target']), 10)
    return mushroom_files, mushroom_targets
import copy
import codecs as cs

from sklearn.datasets import load_files
from sklearn import svm
from sklearn import metrics
from random import shuffle

from ficlearn.feature_extraction.text import BnsVectorizer

if __name__ == '__main__':
    print("-----------------------------------------------")
    print("Load corpus and vectorize with BNSVectorizer")
    print("-----------------------------------------------")
    corpus = "corpus6"
    label_names = ['relevant', 'spam']

    notices = load_files(corpus, categories=label_names, load_content=False)
    data = [
        cs.open(filename, 'r', 'UTF-8').read()
        for filename in notices.filenames
    ]
    n_samples = len(data)
    Y = notices.target

    start = int(n_samples / 10)
    step = start
    recalls = []
    precisions = []
    sizes = []
    N_SAMPLES = copy.deepcopy(n_samples)
    for i in range(2, 10, 1):
        sliceIndex = int((i * 0.1 + 0.1) * N_SAMPLES)
# Text Classification using NLP

# Importing the libraries
import numpy as np
import re
import pickle
import nltk
from nltk.corpus import stopwords
from sklearn.datasets import load_files

nltk.download('stopwords')

# Importing the dataset
reviews = load_files('class/')
X, y = reviews.data, reviews.target

# Pickling the dataset
with open('X.pickle', 'wb') as f:
    pickle.dump(X, f)
with open('y.pickle', 'wb') as f:
    pickle.dump(y, f)

# Unpickling the dataset
X_in = open('X.pickle', 'rb')
y_in = open('y.pickle', 'rb')
X = pickle.load(X_in)
y = pickle.load(y_in)

# Creating the corpus
corpus = []
for i in range(0, 2000):
from sklearn.datasets import load_digits
from sklearn.naive_bayes import BernoulliNB, GaussianNB, MultinomialNB
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn import datasets

datasets.load_digits()

cpath = 'D:\\kaggle\\basicshapes\\shapes\\' + 'circles'
datasets.load_files(cpath)

a = [1, 2, 3, 4, 5]
print(len(a))
print(a[1])
a[2] = 8
print(a.append(88), a)

a = [1, 2, 3, 4, 5]
b = [1, 2, 3, 4, 5, 6]
print(a == b)
print(5 in b)
# categories = [
#     'alt.atheism',
#     'talk.religion.misc',
#     'comp.graphics',
#     'sci.space',
# ]

# load dataset
# print("Loading 20 newsgroups dataset for categories:")
# print(categories if categories else "all")
# data_train = fetch_20newsgroups(subset='train', categories=categories,
#                                 shuffle=True, random_state=42)
# data_train = fetch_20newsgroups(subset='train', shuffle=True, random_state=42)
# data_test = fetch_20newsgroups(subset='test', shuffle=True, random_state=42)

data_train = load_files('train')
data_test = load_files('test')
print('data loaded')

# order of labels in `target_names` can be different from `categories`
target_names = data_train.target_names

# data_train_size_mb = size_mb(data_train.data)
# data_test_size_mb = size_mb(data_test.data)
# print("%d documents - %0.3fMB (training set)" % (
#     len(data_train.data), data_train_size_mb))
# print("%d documents - %0.3fMB (test set)" % (
#     len(data_test.data), data_test_size_mb))
# print("%d categories" % len(categories))
# print()
from sklearn.datasets import load_files
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn import metrics

language_data_folder = r'C:\Users\Imre\PycharmProjects\dirteszt\Webcrawl Excercise\crawl\data'
dataset = load_files(language_data_folder, encoding='utf-8', shuffle=True)

train_data, test_data, y_train, y_test = train_test_split(dataset.data,
                                                          dataset.target,
                                                          test_size=0.3)

vect = TfidfVectorizer(max_df=0.95, min_df=3)
my_clf = Pipeline([('vector', vect), ('clf', LinearSVC(C=1000))])
my_clf.fit(train_data, y_train)

y_predicted = my_clf.predict(test_data)

# print(help(metrics.classification_report))
# print(metrics.classification_report(y_test, y_predicted, labels=range(13),
#                                     target_names=dataset.target_names))

# Hungarian-language sample sentence used to spot-check the language classifier
test_sentences = ['Az iráni látogatáson lévő White tavalyi letartóztatásáról egy emigráns szervezet adott először hírt, ami egy kiszabadult fogolytól azt tudta meg, hogy 2018 októberében, egy Meshed városában lévő börtönben találkozott vele.']
predicted = my_clf.predict(test_sentences)
def load_dataset(path):
    data = load_files(path)
    files = np.array(data['filenames'])
    targets = np.array(data['target'])
    target_labels = np.array(data['target_names'])
    return files, targets, target_labels
def load_dataset(path):
    data = load_files(path)
    fire_files = np.array(data['filenames'])
    fire_targets = np_utils.to_categorical(np.array(data['target']), num_classes)
    return fire_files, fire_targets
def test_default_load_files():
    res = load_files(LOAD_FILES_ROOT)
    assert_equal(len(res.filenames), 1)
    assert_equal(len(res.target_names), 2)
    assert_equal(res.DESCR, None)
    assert_equal(res.data, [b("Hello World!\n")])
import sys

from sklearn.model_selection import GridSearchCV
from sklearn.datasets import load_files
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn import metrics

if __name__ == "__main__":
    # NOTE: we put the following in a 'if __name__ == "__main__"' protected
    # block to be able to use a multi-core grid search that also works under
    # Windows, see: http://docs.python.org/library/multiprocessing.html#windows
    # The multiprocessing module is used as the backend of joblib.Parallel
    # that is used when n_jobs != 1 in GridSearchCV

    # the training data folder must be passed as first argument
    movie_reviews_data_folder = sys.argv[1]
    dataset = load_files(movie_reviews_data_folder, shuffle=False)
    print("n_samples: %d" % len(dataset.data))

    # split the dataset in training and test set:
    docs_train, docs_test, y_train, y_test = train_test_split(
        dataset.data, dataset.target, test_size=0.25, random_state=None)

    # TASK: Build a vectorizer / classifier pipeline that filters out tokens
    # that are too rare or too frequent
    pipeline = Pipeline([
        ('vect', TfidfVectorizer(min_df=3, max_df=0.95)),
        ('clf', LinearSVC(C=1000)),
    ])

    # TASK: Build a grid search to find out whether unigrams or bigrams are
    # more useful.
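    # One way to finish the TASK above (a sketch, not necessarily the exercise's
    # reference solution): grid-search the n-gram range to compare unigrams
    # against unigrams plus bigrams.
    parameters = {'vect__ngram_range': [(1, 1), (1, 2)]}
    grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1)
    grid_search.fit(docs_train, y_train)
    print("Best parameters:", grid_search.best_params_)

    # Evaluate the refitted best model on the held-out test documents.
    y_predicted = grid_search.predict(docs_test)
    print(metrics.classification_report(y_test, y_predicted,
                                        target_names=dataset.target_names))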
def extract_features(corpus):
    '''Extract TF-IDF features from corpus'''
    count_vectorizer = feature_extraction.text.CountVectorizer(
        lowercase=True,
        tokenizer=nltk.word_tokenize,  # use the NLTK tokenizer
        stop_words='english',          # remove stop words
        min_df=1                       # minimum document frequency
    )
    processed_corpus = count_vectorizer.fit_transform(corpus)
    processed_corpus = feature_extraction.text.TfidfTransformer().fit_transform(
        processed_corpus)
    return processed_corpus

data_directory = 'tweets'
tweets_sent_data = datasets.load_files(data_directory, shuffle=True)
print('{} files loaded.'.format(len(tweets_sent_data.data)))
print('They contain the following classes: {}.'.format(
    tweets_sent_data.target_names))

tweets_tfidf = extract_features(tweets_sent_data.data)

X_train, X_test, y_train, y_test = model_selection.train_test_split(
    tweets_tfidf, tweets_sent_data.target, test_size=0.30, random_state=42)

model = linear_model.LogisticRegression()
model.fit(X_train, y_train)
print('Model performance: {}'.format(model.score(X_test, y_test)))

y_pred = model.predict(X_test)