import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_20newsgroups as f20
from sklearn.feature_extraction import text
from matplotlib.ticker import FormatStrFormatter
from nltk.stem.lancaster import LancasterStemmer
from nltk.tokenize.regexp import RegexpTokenizer

# stem every token when calculating TF.IDF so that inflected forms
# collapse to a single feature
class Tokenizer(object):
    def __init__(self):
        self.tok = RegexpTokenizer(r'\b([a-zA-Z]+)\b')
        self.stemmer = LancasterStemmer()

    def __call__(self, doc):
        return [self.stemmer.stem(token) for token in self.tok.tokenize(doc)]

# choose the 8 required classes
cat = ['comp.graphics', 'comp.os.ms-windows.misc',
       'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware',
       'rec.autos', 'rec.motorcycles',
       'rec.sport.baseball', 'rec.sport.hockey']

train = f20(subset='train', categories=cat, shuffle=True, random_state=42)

# plot the number of documents per target (class)
x = np.arange(0, 9, 1)
fig, ax = plt.subplots()
counts, bins, patches = ax.hist(train.target, x, facecolor='red', edgecolor='gray')
ax.set_xticks(bins)
ax.xaxis.set_major_formatter(FormatStrFormatter('%d'))
ax.set_xlabel('Targets', x=1)
ax.set_ylabel('Numbers')

# label each bar with its raw count, placed just below the x-axis
bin_centers = 0.5 * np.diff(bins) + bins[:-1]
for count, x in zip(counts, bin_centers):
    ax.annotate(str(count), xy=(x, 0), xycoords=('data', 'axes fraction'),
                xytext=(0, -18), textcoords='offset points',
                va='top', ha='center')
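# --- Added usage sketch (not in the original script) ---
# Quick sanity check of the stemming tokenizer; Lancaster is an aggressive
# stemmer, so the stems it prints may look heavily truncated.
tok = Tokenizer()
print(tok('The graphics cards crashed while rendering'))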
import Stemmer  # PyStemmer, assumed from the stemWords() call below
from time import time
from sklearn.datasets import fetch_20newsgroups as f20
from sklearn.feature_extraction.text import TfidfVectorizer

english_stemmer = Stemmer.Stemmer('english')

# TfidfVectorizer whose analyzer stems every token
class StemmedTfidfVectorizer(TfidfVectorizer):
    def build_analyzer(self):
        analyzer = super(StemmedTfidfVectorizer, self).build_analyzer()
        return lambda doc: english_stemmer.stemWords(analyzer(doc))

cats = ['comp.graphics', 'comp.os.ms-windows.misc',
        'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware',
        'rec.autos', 'rec.motorcycles',
        'rec.sport.baseball', 'rec.sport.hockey']

print("Loading 20 newsgroups dataset for categories:")
print(list(cats))
print()

traindata = f20(subset='all', categories=cats)
print("%d documents" % len(traindata.data))
print("%d categories" % len(traindata.target_names))
print()

print("Creating stemmed TFxIDF representation...")
t0 = time()
vect = StemmedTfidfVectorizer(stop_words='english')
vectors = vect.fit_transform(traindata.data)  # TFxIDF representation
print("Done in %fs" % (time() - t0))
print("n_samples: %d, n_features: %d" % vectors.shape)
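# --- Added usage sketch (not in the original script) ---
# build_analyzer() exposes the whole preprocessing chain (lowercasing,
# tokenization, stop-word removal, stemming), so it can be inspected
# directly; inflected forms should come out sharing a single stem.
analyze = vect.build_analyzer()
print(analyze("graphics cards and graphic card drivers"))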
import numpy as np
import nltk.stem
from sklearn.datasets import fetch_20newsgroups as f20
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD as TSVD
from sklearn.svm import SVC
from sklearn.metrics import (roc_curve, auc, precision_recall_curve,
                             confusion_matrix, precision_score, recall_score)
# sklearn.cross_validation was removed in scikit-learn 0.20;
# KFold now lives in sklearn.model_selection
from sklearn.model_selection import KFold

english_stemmer = nltk.stem.SnowballStemmer('english')

# TfidfVectorizer whose analyzer stems every token with the Snowball stemmer
class StemmedTfidfVectorizer(TfidfVectorizer):
    def build_analyzer(self):
        analyzer = super(StemmedTfidfVectorizer, self).build_analyzer()
        return lambda doc: (english_stemmer.stem(w) for w in analyzer(doc))

cat = ['comp.graphics', 'comp.os.ms-windows.misc',
       'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware',
       'rec.autos', 'rec.motorcycles',
       'rec.sport.baseball', 'rec.sport.hockey']

train = f20(subset='train', categories=cat, shuffle=True, random_state=42)
##train = f20(subset='train', shuffle=True, random_state=42)

stopwords = text.ENGLISH_STOP_WORDS
vectorizer = StemmedTfidfVectorizer(min_df=1, stop_words=stopwords,
                                    decode_error='ignore')
vector_train = vectorizer.fit_transform(train.data)
tfidf_train = vector_train.toarray()

# reduce the TFxIDF matrix to 50 latent dimensions with truncated SVD (LSI)
svd = TSVD(n_components=50, n_iter=10, random_state=42)
tfidf_train_reduced = svd.fit_transform(tfidf_train)
#print(tfidf_train.shape)
#print(tfidf_train_reduced.shape)

svm_train_data = tfidf_train_reduced
#svm_train_tag = np.concatenate((-np.ones(len(train_comp.data)), np.ones(len(train_rect.data))))
svm_train_tag = []
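# --- Added continuation sketch (not in the original script) ---
# The script stops while svm_train_tag is still empty. Judging by the
# commented-out line above, the intended task is a binary computer-vs-
# recreation split; with categories sorted alphabetically, targets 0-3
# are the comp.* groups and 4-7 the rec.* groups, so one plausible
# continuation is:
svm_train_tag = np.where(train.target < 4, -1, 1)
clf = SVC(kernel='linear', C=1.0)
clf.fit(svm_train_data, svm_train_tag)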
import Stemmer  # PyStemmer, assumed from the stemWords() call below
from time import time
from sklearn.datasets import fetch_20newsgroups as f20
from sklearn.feature_extraction.text import TfidfVectorizer

english_stemmer = Stemmer.Stemmer('english')

# TfidfVectorizer whose analyzer stems every token
class StemmedTfidfVectorizer(TfidfVectorizer):
    def build_analyzer(self):
        analyzer = super(StemmedTfidfVectorizer, self).build_analyzer()
        return lambda doc: english_stemmer.stemWords(analyzer(doc))

# load the data from all 20 categories
cats = ['comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware',
        'comp.sys.mac.hardware', 'comp.windows.x', 'rec.autos', 'rec.motorcycles',
        'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics',
        'sci.med', 'sci.space', 'misc.forsale', 'talk.politics.misc',
        'talk.politics.guns', 'talk.politics.mideast', 'talk.religion.misc',
        'alt.atheism', 'soc.religion.christian']

print("Loading 20 newsgroups dataset for categories:")
print(list(cats))
print()

traindata = f20(subset='all')
print("%d documents" % len(traindata.data))
print("%d categories" % len(traindata.target_names))
print()

print("Creating stemmed TFxIDF representation...")
t0 = time()
vect = StemmedTfidfVectorizer(stop_words='english')
vectors = vect.fit_transform(traindata.data)  # TFxIDF representation
print("Done in %fs" % (time() - t0))
print("n_samples: %d, n_features: %d" % vectors.shape)
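# --- Added follow-up sketch (not in the original script) ---
# The TFxIDF matrix is stored sparse; checking its density shows why it
# should not be converted to a dense array casually at this scale.
density = vectors.nnz / float(vectors.shape[0] * vectors.shape[1])
print("density: %.4f%%" % (100.0 * density))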
""" import numpy as np from sklearn.datasets import fetch_20newsgroups as f20 from sklearn.feature_extraction import text from sklearn.feature_extraction.text import CountVectorizer from nltk.stem.lancaster import LancasterStemmer from nltk.tokenize.regexp import RegexpTokenizer import scipy class Tokenizer(object): def __init__(self): self.tok=RegexpTokenizer(r'\b([a-zA-Z]+)\b') self.stemmer = LancasterStemmer() def __call__(self, doc): return [self.stemmer.stem(token) for token in self.tok.tokenize(doc)] # combine the documents of the same classes into the same docuemnt, divided by ' ' train = f20(subset='train',shuffle = True, random_state = 42) datalist=[] for i in range(0,20): datalist.append('') for i in range(0,len(train.data)): datalist[train.target[i]]+=(' '+train.data[i]) # get the count vector stopwords = text.ENGLISH_STOP_WORDS vectorizer = CountVectorizer(tokenizer = Tokenizer(), stop_words=stopwords, min_df=1) vector = vectorizer.fit_transform(datalist) count=vector.toarray() # get the if and icf index={0:3,1:4,2:6,3:15}
#!/usr/bin/env python3.6
# -*- coding: utf-8 -*-
# @Author: Yang Xiaojun
from sklearn.datasets import fetch_20newsgroups as f20
from sklearn.feature_extraction.text import CountVectorizer as CV
from sklearn.feature_extraction.text import TfidfTransformer as TF
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.svm import SVC

tr_data = f20(subset='train',
              categories=['comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale'])
tr_data_x = tr_data.data
tr_data_y = tr_data.target

te_data = f20(subset='test',
              categories=['comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale'])
te_data_x = te_data.data    # test features/labels must come from te_data,
te_data_y = te_data.target  # not tr_data

target_name = tr_data.target_names

# def feature_work(data=None, vb=None, stop_words=None, max_df=1):
#     cv = CV(stop_words=stop_words, max_df=max_df, vocabulary=vb)
#     #print(cv.vocabulary)
#     tr_vb = cv.vocabulary_
#
#     tf = TF()
#     tf_idf = tf.fit_transform(cv.fit_transform(data))  # term counts and TF-IDF values
#     print('0:', cv.fit_transform(data).shape)
#     print('1:', tf_idf.shape)
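# --- Added pipeline sketch (not in the original script) ---
# A minimal working version of what the commented-out feature_work seems
# to be driving at: fit the vocabulary on the training split only, reuse
# it on the test split, then train and score multinomial naive Bayes.
cv = CV(stop_words='english')
tr_counts = cv.fit_transform(tr_data_x)
te_counts = cv.transform(te_data_x)

tf = TF()
tr_tfidf = tf.fit_transform(tr_counts)
te_tfidf = tf.transform(te_counts)

clf = MultinomialNB().fit(tr_tfidf, tr_data_y)
pred = clf.predict(te_tfidf)
print(accuracy_score(te_data_y, pred))
print(classification_report(te_data_y, pred, target_names=target_name))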