def get_name_features(names):
    name = []
    for i in names:
        s = re.findall('(?i)[a-z]{2,}', i)
        name.append(' '.join(s))
    cv = CV(analyzer='char_wb', ngram_range=(3, 4))
    fn = cv.fit_transform(name).toarray()
    return fn
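# Hedged usage sketch (not from the original source): the names below are illustrative, and
# `re` plus the CV alias for CountVectorizer are assumed to be imported as the function
# requires. Note that each call fits a fresh vectorizer, so feature columns are only
# comparable within a single call to get_name_features.
sample_names = ["Ada Lovelace", "Alan Turing", "Grace Hopper"]
name_features = get_name_features(sample_names)
print(name_features.shape)  # (3, number of distinct char 3-4 grams)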
def __init__(self, vectorizer=None, clf_model=None, use_tokens=False,
             forecast_attribute_name: str = "prediction",
             forecast_prob_attribute_name: str = "score"):
    super().__init__(forecast_attribute_name=forecast_attribute_name,
                     forecast_prob_attribute_name=forecast_prob_attribute_name)
    if vectorizer is None:
        print("Initializing default unigram CountVectorizer...")
        if use_tokens:
            self.vectorizer = CV(decode_error='ignore', min_df=10, max_df=.5,
                                 ngram_range=(1, 1), binary=False, max_features=15000,
                                 tokenizer=lambda x: x, preprocessor=lambda x: x)
        else:
            self.vectorizer = CV(decode_error='ignore', min_df=10, max_df=.5,
                                 ngram_range=(1, 1), binary=False, max_features=15000)
    else:
        self.vectorizer = vectorizer
    if clf_model is None:
        print("Initializing default classification model (standard scaled logistic regression)")
        self.clf_model = Pipeline([
            ("standardScaler", StandardScaler(with_mean=False)),
            ("logreg", LogisticRegression(solver='liblinear'))
        ])
    else:
        self.clf_model = clf_model
def string_vectorize(Xs_list):
    vc = CV(analyzer='char_wb', ngram_range=(3, 4), min_df=1, token_pattern='[a-z]{2,}')
    name = []
    for i in Xs_list:
        s = re.findall('(?i)[a-z]{2,}', "".join(str(x) for x in i))
        name.append(' '.join(s))
    vc.fit(name)
    vec = vc.transform(name).toarray()
    dictionary = vc.get_feature_names()
    return vec, dictionary
def __init__(self, obj_type="utterance", text_func=None, cv=None, ngram_range=None, prior=0.1, class1_attribute_name='fighting_words_class1', class2_attribute_name='fighting_words_class2'): assert obj_type in ["speaker", "utterance", "conversation"] self.obj_type = obj_type if text_func is None: if obj_type == 'utterance': self.text_func = lambda utt: FightingWords.clean_text(utt.text) elif obj_type == 'conversation': self.text_func = lambda convo: \ FightingWords.clean_text(' '.join([utt.text for utt in convo.iter_utterances()])) else: self.text_func = lambda spkr: \ FightingWords.clean_text(' '.join([utt.text for utt in spkr.iter_utterances()])) else: self.text_func = text_func self.ngram_range = ngram_range self.prior = prior self.cv = cv self.ngram_zscores = None self._count_matrix = None if self.cv is None and type(self.prior) is not float: raise ValueError( "If using a non-uniform prior, you must pass a count vectorizer with " "the vocabulary parameter set.") if self.cv is None: print("Initializing default CountVectorizer", end=" ") if self.ngram_range is None: self.ngram_range = (1, 3) print("with ngram_range {}...".format(self.ngram_range), end=" ") self.cv = CV(decode_error='ignore', min_df=10, max_df=.5, ngram_range=self.ngram_range, binary=False, max_features=15000) print("Done.") self.class1_attribute_name = class1_attribute_name self.class2_attribute_name = class2_attribute_name
def __init__(self):
    self.db = self.connectDB()
    self.colList = [
        "api_busan", "api_herald", "api_nocut", "api_ohmynews", "api_wikitree",
        "api_donga", "api_hangook", "api_joseon", "api_yeonhap", "api_joongang",
    ]
    self.Threshold = 1.7
    self.data = []
    self.newData = []
    self.okt = Okt()
    self.vectorizer = CV(min_df=1)
    self.contents_all = []
def load_data():
    # loading in file
    allData = open("patientdata.txt")
    data = allData.readlines()
    allData.close()
    cv = CV()
    data = cv.fit_transform(data)
    fitted = data.toarray()
    # `labels` and `split` are assumed to be defined elsewhere in the original script
    fitted = np.column_stack((fitted, labels))
    training, validation, test = split(fitted)
    trainingLabel = training[:, -1]
    training = training[:, :-1]
    validationLabel = validation[:, -1]
    validation = validation[:, :-1]
    testLabel = test[:, -1]
    test = test[:, :-1]
    return training, validation, test, trainingLabel, validationLabel, testLabel, cv
def __init__(self, obj_type: str, vectorizer=None, vector_name="bow_vector",
             text_func: Callable[[CorpusObject], str] = lambda utt: utt.text):
    if vectorizer is None:
        print("Initializing default unigram CountVectorizer...")
        self.vectorizer = CV(decode_error='ignore', min_df=10, max_df=.5,
                             ngram_range=(1, 1), binary=False, max_features=15000)
    else:
        self.vectorizer = vectorizer
    self.obj_type = obj_type
    self.vector_name = vector_name
    self.text_func = text_func
def __init__(self, class1_selector: Callable[[Utterance], bool],
             class2_selector: Callable[[Utterance], bool],
             cv=None, ngram_range=None, prior=0.1, threshold=1, top_k=10,
             annot_method="top_k",
             string_sanitizer=lambda str_: FightingWords.clean_text(str_)):
    """
    :param class1_selector: selector function for identifying utterances that belong to class 1
    :param class2_selector: selector function for identifying utterances that belong to class 2
    :param cv: optional CountVectorizer. default: an sklearn CV with min_df=10, max_df=.5, and
        ngram_range=(1,3) with max 15000 features
    :param ngram_range: range of ngrams to use if using default cv
    :param prior: either a float describing a uniform prior, or a vector describing a prior
        over vocabulary items. If you're using a predefined vocabulary, make sure to specify
        that when you make your CountVectorizer object.
    :param threshold: the z-score threshold for annotating utterances with identified ngrams
    :param top_k: the top_k threshold for which ngrams to annotate utterances with
    :param annot_method: "top_k" or "threshold" to specify which annotation method to use in transform()
    :param string_sanitizer: optional function for cleaning strings prior to fighting words
        analysis; uses the default string sanitizer otherwise
    """
    self.class1_selector = class1_selector
    self.class2_selector = class2_selector
    self.ngram_range = ngram_range
    self.prior = prior
    self.cv = cv
    self.threshold = threshold
    self.top_k = top_k
    assert annot_method in ["top_k", "threshold"]
    self.annot_method = annot_method
    self.ngram_zscores = None
    self.string_sanitizer = string_sanitizer
    self._count_matrix = None
    if self.cv is None and type(self.prior) is not float:
        raise ValueError("If using a non-uniform prior, you must pass a count vectorizer with "
                         "the vocabulary parameter set.")
    if self.cv is None:
        print("Initializing default CountVectorizer...")
        if self.ngram_range is None:
            self.ngram_range = (1, 3)
        self.cv = CV(decode_error='ignore', min_df=10, max_df=.5,
                     ngram_range=self.ngram_range, binary=False, max_features=15000)
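# Hedged usage sketch (not part of the original snippet): constructing the transformer whose
# __init__ appears above, assuming the enclosing class is the FightingWords transformer it
# references. The meta["group"] key in the selectors is hypothetical; any predicate over an
# Utterance works.
fw = FightingWords(
    class1_selector=lambda utt: utt.meta.get("group") == "A",  # hypothetical metadata key
    class2_selector=lambda utt: utt.meta.get("group") == "B",
    ngram_range=(1, 2),        # overrides the (1, 3) default used for the default CountVectorizer
    annot_method="threshold",  # annotate by z-score threshold rather than top_k
    threshold=2.0,
)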
def score_word_similiarity(i):
    seg_model = CV(binary=True, min_df=1, ngram_range=(1, n_gram_size_2), max_features=2000,
                   lowercase=lowercase, tokenizer=tokenizer, token_pattern=token_pattern)
    vec_seg = seg_model.fit_transform(segments_sen[i]).toarray()
    similiarity_index = 0
    seg_size = len(segments_sen[i])
    print("here", seg_size - neighbour_limit, is_pure_seg[i], end=" ")
    for j in range(seg_size - neighbour_limit):
        temp_similiarity_index = 0
        for k in range(j, j + neighbour_limit):
            similiarity_index += sum(x[0] * x[1] for x in zip(vec_seg[j], vec_seg[k]))
            temp_similiarity_index += sum(x[0] * x[1] for x in zip(vec_seg[j], vec_seg[k]))
        print(temp_similiarity_index, end=" ")
    print("end")
    if similiarity_index > threshold_2:
        pure_segments.append(segments[i])
        pure_data.extend(segments_sen[i])
    else:
        mixed_segments.append(segments[i])
        mixed_data.extend(segments_sen[i])
    if is_pure_seg[i] == True:
        score_true.append(similiarity_index)
        if similiarity_index > threshold_2:
            pure.append(1)
        else:
            mixed.append(0)
    else:
        score_false.append(similiarity_index)
        if similiarity_index > threshold_2:
            pure.append(0)
        else:
            mixed.append(1)
def genvec_from_features(self, features):
    # Vectorize the features
    for em in features:
        self.vectors.append(" ".join(em))
    # cv = CV(min_df=0.1, max_df=0.9)
    cv = CV()
    # tfidf = TFIDF()
    tf = cv.fit_transform(self.vectors)
    # matrix = tfidf.fit_transform(tf).toarray()
    matrix = tf.toarray()
    # Normalize the vectors
    x = np.array(list(map(np.linalg.norm, matrix)))
    # If x = 0 a divide-by-zero error occurs; a zero norm means the row has no features,
    # so drop that row.
    nonzero = (x != 0)
    n_matrix = (matrix[nonzero].T / x[nonzero]).T
    self.dirlist = list(np.array(self.dirlist)[nonzero])
    # Run principal component analysis
    pca = PCA(n_components=self.pca_ncomponents)
    pca.fit(n_matrix)
    # Project the dataset onto the principal components
    transformed = pca.fit_transform(n_matrix)
    return self.dirlist, transformed
def __init__(self, obj_type: str, vector_name="bow_vector",
             text_func: Callable[[CorpusComponent], str] = None, vectorizer=None):
    if vectorizer is None:
        print("Initializing default unigram CountVectorizer...", end="")
        self.vectorizer = CV(decode_error='ignore', min_df=10, max_df=.5,
                             ngram_range=(1, 1), binary=False, max_features=15000)
        print("Done.")
    else:
        self.vectorizer = vectorizer
    self.obj_type = obj_type
    self.vector_name = vector_name
    if text_func is None:
        if obj_type == "utterance":
            self.text_func = lambda utt: utt.text
        elif obj_type == "conversation":
            self.text_func = lambda convo: " ".join(utt.text for utt in convo.iter_utterances())
        elif obj_type == "speaker":
            self.text_func = lambda speaker: " ".join(utt.text for utt in speaker.iter_utterances())
        else:
            raise ValueError("Invalid corpus object type. Use 'utterance', 'conversation', or 'speaker'")
    else:
        self.text_func = text_func
    for sentence in sentences:
        doc = sentence.lower()  # lower case
        doc = re.findall(r'[a-zA-Z]+', doc)  # removing numbers and special characters
        doc = [w for w in doc if w not in stop]  # removing stopwords
        doc = " ".join(doc)
        stemmer = SnowballStemmer('english')
        clean = stemmer.stem(doc)
        clean_text.append(clean)
    return clean_text


clean_text = cleaner(text)

from sklearn.feature_extraction.text import CountVectorizer as CV
cv = CV(ngram_range=(1, 2), encoding='latin', max_features=20000)  # unigrams + bigrams (the minimum n-gram size must be >= 1)
X = cv.fit_transform(clean_text)
features = cv.get_feature_names()
dtm = pd.DataFrame(X.toarray(), columns=features)
print(dtm.shape)

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(dtm, tags, test_size=0.1, random_state=40)

from sklearn.ensemble import RandomForestClassifier
print("Running Model")
clfrf = RandomForestClassifier(200, n_jobs=-1, bootstrap=True)
clfrf.fit(X_train, y_train)
# GROUPING AND CORRELATION
print(df.groupby('stars').mean().head(5))
print(df.groupby('stars').mean().head(5).corr())
# sns.heatmap(df.groupby('stars').mean().head(5).corr(), annot=True)
# plt.show()

# CREATE YELP DF ONLY FOR 1 AND 5 STAR REVIEWS
yelp_class = df[(df['stars'] == 1) | (df['stars'] == 5)]
X = yelp_class['text']
y = yelp_class['stars']

# CREATE COUNT VECTORIZER AND FIT TO X
cv = CV().fit(X)

# OVERWRITE X WITH TRANSFORM
X = cv.transform(X)

# TRAIN TEST SPLIT
X_train, X_test, y_train, y_test = TTS(X, y, test_size=0.3, random_state=64)

# CREATE NAIVE BAYES OBJECT AND FIT
nb = MNB().fit(X_train, y_train)
pred = nb.predict(X_test)
print(CR(y_test, pred))

# PIPELINE
pipe = Pipeline([('CV', CV()), ('TFIDF', TT()), ("BAYES", MNB())])
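# Hedged usage sketch (not from the original source): the pipeline above starts with a
# CountVectorizer, so it should be fit on the raw review text rather than the already
# vectorized X; the split parameters mirror the ones used earlier.
X_text_train, X_text_test, y_train2, y_test2 = TTS(
    yelp_class['text'], yelp_class['stars'], test_size=0.3, random_state=64)
pipe.fit(X_text_train, y_train2)
print(CR(y_test2, pipe.predict(X_text_test)))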
# demonstrates ways to count unique words in Python
from collections import Counter
import nltk
from sklearn.feature_extraction.text import CountVectorizer as CV

text = 'ah list of ah words'
t = text.split()

c = Counter(t)
# get unique words
c.keys()
c.most_common(10)

fd = nltk.FreqDist(t)
# unique words
fd.keys()
# you can plot the distribution easily
fd.plot()
# get words that occur at least a certain number of times
more_than_once = [(f, c) for f, c in fd.items() if c > 1]
fd.most_common(10)

# this method is more useful for multiple documents
vec = CV()
res = vec.fit_transform([text])
# same result as counter/FreqDist here
vec.vocabulary_
# get unique words
vec.vocabulary_.keys()
def count_init(s140_train):
    # Fit the corpus to the CountVectorizer (bag of words)
    eng_words = np.genfromtxt("data/corpus.txt", dtype="str")
    CVec = CV()
    CVec.fit(eng_words)
    return CVec
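# Hedged usage sketch (not from the original source): count_init ignores its s140_train
# argument as written and fits the vectorizer on the word list in data/corpus.txt, so new
# documents are encoded against that fixed vocabulary. The sample documents are made up.
cvec = count_init(None)
sample_docs = ["good movie", "bad movie", "great plot bad acting"]
doc_term_counts = cvec.transform(sample_docs)  # sparse (3, |corpus.txt vocabulary|) matrix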
review["overall"] reviews.append(Review(review["reviewText"], review["overall"])) # Split data from sklearn.model_selection import train_test_split training, test = train_test_split(reviews, test_size=.33, random_state=42) train_x = [x.text for x in training] train_y = [x.sentiment for x in training] test_x = [x.text for x in test] test_y = [x.sentiment for x in test] # Bags of words vectorization from sklearn.feature_extraction.text import CountVectorizer as CV vectorizer = CV() train_x_vectors = vectorizer.fit_transform(train_x) test_x_vectors = vectorizer.transform(test_x) #just transform for test data # Classification # there are many different classification methods from sklearn import svm clf_svm = svm.SVC(kernel="linear") # linear svm clf_svm.fit(train_x_vectors, train_y) from sklearn.tree import DecisionTreeClassifier # decision tree clf_dec = DecisionTreeClassifier() clf_dec.fit(train_x_vectors, train_y) from sklearn.naive_bayes import GaussianNB #Naive Bayes
xTest = []
yTest = []
for J in range(5):
    if J != I:
        xTrain.extend(XkFold[J])
        yTrain.extend(YkFold[J])
    else:
        testIndex = J
xTrain = transform(xTrain)
xTest = transform(XkFold[testIndex])
assert len(xTrain) == len(yTrain)
xTrainNew, yTrainNew = balancedTrain(xTrain, yTrain, 'CV')
counterList.append(CV(ngram_range=(2, 2), min_df=5))
trainVector = counterList[-1].fit_transform(xTrainNew)
testVector = counterList[-1].transform(xTest)
selectList.append(SelectKBest(chi2, k=min(10000, trainVector.shape[1])))
trainVector = selectList[-1].fit_transform(trainVector, yTrainNew)
testVector = selectList[-1].transform(testVector)
mreTotal.append(0)
for J in clfOption:
    J.fit(trainVector, yTrainNew)
    prediction = J.predict(testVector)
    mreTotal[-1] += mrc(prediction, YkFold[testIndex])
def bayes_compare_language(l1, l2, ngram=1, prior=.01, cv=None):
    '''
    Arguments:
    - l1, l2; a list of strings from each language sample
    - ngram; an int describing up to what n gram you want to consider (1 is unigrams,
      2 is bigrams + unigrams, etc). Ignored if a custom CountVectorizer is passed.
    - prior; either a float describing a uniform prior, or a vector describing a prior
      over vocabulary items. If you're using a predefined vocabulary, make sure to specify
      that when you make your CountVectorizer object.
    - cv; a sklearn.feature_extraction.text.CountVectorizer object, if desired.

    Returns:
    - A list of length |Vocab| where each entry is a (n-gram, zscore) tuple.
    '''
    if cv is None and type(prior) is not float:
        print("If using a non-uniform prior:")
        print("Please also pass a count vectorizer with the vocabulary parameter set.")
        quit()
    l1 = [basic_sanitize(l) for l in l1]
    l2 = [basic_sanitize(l) for l in l2]
    if cv is None:
        cv = CV(decode_error='ignore', min_df=10, max_df=.5, ngram_range=(1, ngram),
                binary=False, max_features=15000)
    counts_mat = cv.fit_transform(l1 + l2).toarray()
    # Now sum over languages...
    vocab_size = len(cv.vocabulary_)
    print("Vocab size is {}".format(vocab_size))
    if type(prior) is float:
        priors = np.array([prior for i in range(vocab_size)])
    else:
        priors = prior
    z_scores = np.empty(priors.shape[0])
    count_matrix = np.empty([2, vocab_size], dtype=np.float32)
    count_matrix[0, :] = np.sum(counts_mat[:len(l1), :], axis=0)
    count_matrix[1, :] = np.sum(counts_mat[len(l1):, :], axis=0)
    a0 = np.sum(priors)
    n1 = 1. * np.sum(count_matrix[0, :])
    n2 = 1. * np.sum(count_matrix[1, :])
    print("Comparing language...")
    for i in range(vocab_size):
        # compute delta
        term1 = np.log((count_matrix[0, i] + priors[i]) /
                       (n1 + a0 - count_matrix[0, i] - priors[i]))
        term2 = np.log((count_matrix[1, i] + priors[i]) /
                       (n2 + a0 - count_matrix[1, i] - priors[i]))
        delta = term1 - term2
        # compute variance on delta
        var = 1. / (count_matrix[0, i] + priors[i]) + 1. / (count_matrix[1, i] + priors[i])
        # store final score
        z_scores[i] = delta / np.sqrt(var)
    index_to_term = {v: k for k, v in cv.vocabulary_.items()}
    sorted_indices = np.argsort(z_scores)
    return_list = []
    for i in sorted_indices:
        return_list.append((index_to_term[i], z_scores[i]))
    return return_list
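# Hedged usage sketch (not from the original source): comparing two tiny, made-up samples with
# the function above (basic_sanitize is assumed to be defined in the same module). With inputs
# this small the default CountVectorizer (min_df=10) would keep nothing, so a permissive
# vectorizer is passed in explicitly.
sample_cv = CV(decode_error='ignore', min_df=1, binary=False, ngram_range=(1, 2))
corpus_a = ["the cat sat on the mat", "the cat chased the mouse"]
corpus_b = ["the dog barked at the mailman", "the dog chased the ball"]
ranked_terms = bayes_compare_language(corpus_a, corpus_b, prior=0.01, cv=sample_cv)
print(ranked_terms[:5])   # terms most associated with corpus_b (lowest z-scores)
print(ranked_terms[-5:])  # terms most associated with corpus_a (highest z-scores)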
'''finding features and vectorising segments'''
'''#########################################'''
'''
model = model with feature words having at least frequency = 3 = 11000
vec_seg (sparse matrix) = [
    [0,0,1,1,0,1,1,1,1,0,0,0,0,1,1,... number of feature words = 11000]
    [0,0,1,0,0,1,1,0,1,0,0,1,1,0,0,... whether word present or not]
    .... number of segments
]
number_f_w = number of feature words extracted from merged data
'''
model = CV(binary=True, min_df=3, ngram_range=(1, n_gram_size), max_features=20000,
           lowercase=lowercase, tokenizer=tokenizer, token_pattern=token_pattern)
model = model.fit(merged_data)
vec_seg = model.transform(segments)
number_f_w = len(model.vocabulary_)
vec_seg = vec_seg.toarray()
max_features = min(max_features, number_f_w)
print("number of feature words:", number_f_w)
print("STEP 2 done")

'''######'''
'''Step 2'''
'''######'''

'''############################################'''
'''#################Step 3#####################'''
ps = PorterStemmer()
clnd_msgs = []
# wnl = WordNetLemmatizer()
for i in range(len(messages)):
    temp_msg = re.sub('[^a-zA-Z]', ' ', messages['message'][i])
    temp_msg = temp_msg.lower()
    temp_msg = temp_msg.split()
    temp_msg = [ps.stem(word) for word in temp_msg
                if word not in set(stopwords.words('english'))]
    # temp_msg = [wnl.lemmatize(word) for word in temp_msg
    #             if word not in set(stopwords.words('english'))]
    temp_msg = ' '.join(temp_msg)
    clnd_msgs.append(temp_msg)

# Creating the BagOfWords model
from sklearn.feature_extraction.text import CountVectorizer as CV
cv = CV(max_features=5000)  # keep only the 5k most frequent features (columns/words)
X = cv.fit_transform(clnd_msgs).toarray()

'''
Creating a TF-IDF model instead:
from sklearn.feature_extraction.text import TfidfVectorizer as TV
tv = TV(max_features=5000)  # keep only the 5k most frequent features (columns/words)
X = tv.fit_transform(clnd_msgs).toarray()
'''

# Output data
Y = pd.get_dummies(messages['label'])
Y = Y.iloc[:, 1].values

# Train Test split
from sklearn.model_selection import train_test_split
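# Hedged continuation sketch (not from the original snippet): the split itself, using the X
# and Y arrays built above; the 80/20 ratio and random_state are illustrative.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.20, random_state=0)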
            continue
        else:
            if len(word) > 1:
                features.append(ps.stem(word))
    return features, sentiment


with open(opath + "sentiment.txt", encoding='cp1252') as f:
    train_x = []
    train_y = []
    for line in f:
        x, y = getFeatures(line)
        train_x.append(' '.join(x))
        train_y.append(1.0 if y == '+1' else 0.0)

cv = CV()
train_x_cv = cv.fit_transform(train_x)
model = LogisticRegression()
model.fit(train_x_cv, train_y)

with open(opath + "sentiment.txt", encoding='cp1252') as f:
    tp = 0
    tn = 0
    fp = 0
    fn = 0
    for text in f:
        x = cv.transform([text])
        y = model.predict(x)
        y_p = model.predict_proba(x)
        label = text[:2]
te_data_y = tr_data.target
target_name = tr_data.target_names

from sklearn.svm import SVC

# def feature_work(data=None, vb=None, stop_words=None, max_df=1):
#     cv = CV(stop_words=stop_words, max_df=max_df, vocabulary=vb)
#     # print(cv.vocabulary)
#     tr_vb = cv.vocabulary_
#
#     tf = TF()
#     tf_idf = tf.fit_transform(cv.fit_transform(data))  # term counts and tf-idf values
#     print('0:', cv.fit_transform(data).shape)
#     print('1:', tf_idf.shape)
#     # word = cv.get_feature_names()  # the vocabulary terms
#     # weight = tf_idf.toarray()
#     return tr_vb, tf_idf

cv = CV(stop_words='english', max_df=0.8)
tf = TF()
tr_idf = tf.fit_transform(cv.fit_transform(tr_data_x))  # term counts and tf-idf values
print('0:', cv.fit_transform(tr_data_x).shape)
te_idf = tf.fit_transform(cv.fit_transform(te_data_x))  # term counts and tf-idf values
print('1:', cv.fit_transform(te_data_x).shape)

# train features: tf_tr is the training input derived from tr_data_x; tr_data_y (the training
# target) is unchanged
# tr_vb, tf_tr = feature_work(tr_data_x, stop_words='english', max_df=0.5)
# test features
# te_vb, tf_te = feature_work(te_data_x, vb=tr_vb)


def getaccuracy(model=None, x=None, y_test=None, tar_name=None):
    y_pre = model.predict(x)
    print(classification_report(y_test, y_pre, target_names=tar_name))
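# Hedged sketch (not from the original snippet): as written above, the test set is vectorized
# with its own freshly fitted vocabulary, so the train and test feature columns will not line
# up. A consistent alternative is to fit the vectorizer and tf-idf transformer on the training
# text only and reuse them on the test text.
cv_shared = CV(stop_words='english', max_df=0.8)
tf_shared = TF()
tr_feats = tf_shared.fit_transform(cv_shared.fit_transform(tr_data_x))
te_feats = tf_shared.transform(cv_shared.transform(te_data_x))
print(tr_feats.shape, te_feats.shape)  # same number of columns for train and test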
# CLEAN UP MESS
clean_mess = [word for word in nopunc.split() if word.lower() not in sw]
print(clean_mess)


# CREATE FUNCTION FOR CLEANING UP MESSAGES
def text_process(mess):
    np = ''.join([c for c in mess if c not in string.punctuation])
    return [word.lower() for word in np.split() if word.lower() not in sw]


# EXAMPLE CLEAN OF HEAD OF MESSAGES
print(df['msg'].head(5).apply(text_process))

# INSTANTIATE COUNT VECTORIZER AND FIT
bow_transformer = CV().fit(df['msg'])

# LENGTH OF VOCABULARY IN VECTORIZER
print(len(bow_transformer.vocabulary_))

# GET MESSAGE NUMBER 4 UNALTERED
mess4 = df['msg'].iloc[3]
print(mess4)

# GET BAG OF WORDS FROM THE FITTED TRANSFORMER (VECTORIZER)
bow4 = bow_transformer.transform([mess4])
print(bow4)
print(bow4.shape)

# SEE THE WORDS WHICH WERE IN THE MESSAGE TWICE (THESE RESULTS ARE DIFFERENT FROM THE
# LECTURE - I SUSPECT THAT SCIKIT-LEARN WAS SIMPLY UPDATED)
To that end, let readers and writers engage with each other through reading and writing
articles, and grow information that is highly reusable and general-purpose.
'''

txt = '''
Python is an interpreted high-level programming language for general-purpose programming.
Created by Guido van Rossum and first released in 1991, Python has a design philosophy that
emphasizes code readability, and a syntax that allows programmers to express concepts in
fewer lines of code, notably using significant whitespace. It provides constructs that enable
clear programming on both small and large scales.
Python features a dynamic type system and automatic memory management. It supports multiple
programming paradigms, including object-oriented, imperative, functional and procedural, and
has a large and comprehensive standard library.
Python interpreters are available for many operating systems. CPython, the reference
implementation of Python, is open source software and has a community-based development model,
as do nearly all of its variant implementations. CPython is managed by the non-profit Python
Software Foundation.
'''

source_list = []
for x in txt.split('\n'):
    if x != '':
        source_list.append(x)
# print(source_list)

# cv = CV()
cv = CV(stop_words="english", ngram_range=(1, 2))
matrix = cv.fit_transform(source_list)
print(matrix)
print(cv.get_feature_names())
from nltk.stem.porter import PorterStemmer as PS

corpus = []
for i in range(0, 1000):
    review = re.sub('[^A-Za-z]', ' ', dataset['Review'][i])
    review = review.lower()
    review = review.split()
    ps = PS()
    review = [ps.stem(word) for word in review
              if word not in set(stopwords.words('english'))]
    review = ' '.join(review)
    corpus.append(review)

# Creating the Bag of Words model
from sklearn.feature_extraction.text import CountVectorizer as CV
cv = CV(max_features=1500)  # Keeping only the 1500 most used words
X = cv.fit_transform(corpus).toarray()
y = dataset.iloc[:, 1].values

# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=0)

# Using Naive Bayes model
# Fitting the Classifier to the Training set
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(X_train, y_train)
# print(re.findall(r'([a-zA-Z_-]+|\d+\.\d+|\d+)', ))  # needs a string argument, e.g. the `s` defined below

from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

import re

# s = 'RT @bugwannostra: @Louuu_ thx #FFFFs People power -_- works ❤signing… https://t.co/pl2bquE5Az'
s = 'RT @bugwannostra: @Louuu_432 thx 6.3 #FF-FFs, People power -_- https:/ 2.34 234w.orks ❤signing… ht.tp: https:'
re.findall(r'([a-zA-Z_-]+|\d+\.\d+|\d+)', s)

from sklearn.feature_extraction.text import CountVectorizer as CV
cv = CV(analyzer='word', token_pattern=r'([a-zA-Z_-]+|\d+\.\d+|\d+)',
        stop_words=stop_words, max_df=0.8, min_df=1e-5)
# cv.fit_transform(textarr)

# This line has the same display effect as running cProfile.run("foo()") directly
# p.strip_dirs().sort_stats(-1).print_stats()
# strip_dirs(): strip irrelevant path information from all module names
# sort_stats(): sort the printed entries by the standard module/name/line string
# print_stats(): print all the profiling information
# Sort by the cumulative running time within a function
# print_stats(3): print only the first 3 function entries; the argument can also be a fraction,
#   meaning the top percentage of functions
# python3.5 -m cProfile -o res event_extractor.py
import pstats
# combine train and test sets
train_set = train_neg_docs + train_pos_docs
test_set = test_neg_docs + test_pos_docs

# combine pos and neg sets
pos_rev = clean_train_pos + clean_test_pos
neg_rev = clean_train_neg + clean_test_neg

# Count Vectorizer
from sklearn.feature_extraction.text import CountVectorizer as CV
import numpy as np
import pandas as pd

# get frequency counts
word_frequency_vectorizer = CV(binary=False)

# apply it to positive reviews
pos_freq_dtm = word_frequency_vectorizer.fit_transform(pos_rev)

# convert dtm to df
pos_freq_df = pd.DataFrame(pos_freq_dtm.toarray(),
                           columns=word_frequency_vectorizer.get_feature_names())
pos_freq_df.shape

# get the most frequent words in pos and neg reviews
from nltk import FreqDist
from nltk.corpus import stopwords

all_pos_words = []
filt_pos_words = []
def bayes_compare_language(l1, l2, ngram_range=(1, 3), prior=.01, cv=None, counts_mat=None):
    """
    Parameters
    ----------
    l1, l2 : Iterable[str]
        list of strings from each language sample
    ngram_range : Tuple[int, int], default=(1, 3)
        a tuple describing the range of n-grams to consider ((1, 1) is unigrams only,
        (1, 2) is bigrams + unigrams, etc). Ignored if a custom CountVectorizer is passed.
    prior : Union[float, array[float]]
        a float describing a uniform prior, or a vector describing a prior over vocabulary
        items. If you're using a predefined vocabulary, make sure to specify that when you
        make your CountVectorizer object.
    cv : Optional[sklearn.feature_extraction.text.CountVectorizer], default=None
        Pass this if you have a pre-defined vocabulary. If None, defaults to an sklearn CV
        with min_df=10, max_df=.5, ngram_range=(1, 3) and max 15000 features.
    counts_mat : Optional[np.ndarray[len(l1 + l2), k]], default=None
        Counts matrix with size equal to the length of `l1 + l2` (which must also be in that
        order) and with k features. Pass this if you already have a vectorized dataset. If
        given, then the vectorizer must also be passed to `cv`.

    Returns
    -------
    z_scores : pd.DataFrame
        A pandas DataFrame of shape (|Vocab|, 2) with (n-gram, z-score) pairs.
    count_matrix : array[array[float]]
        A 2-row matrix of counts of terms in l1 and l2 respectively.
    """
    if cv is None and type(prior) is not float:
        raise ValueError("If using a non-uniform prior, please also pass a count "
                         "vectorizer with the vocabulary parameter set.")
    if counts_mat is not None:
        assert isinstance(cv, CV)

    # clean the text
    if counts_mat is None:
        logger.info('Basic cleaning of the text')
        l1 = [basic_sanitize(l) for l in l1]
        l2 = [basic_sanitize(l) for l in l2]

    # initialize count vectorizer
    if counts_mat is None:
        logger.info('Vectorizing documents with CountVectorizer')
        if cv is None:
            cv = CV(decode_error='ignore', min_df=10, max_df=.5, ngram_range=ngram_range,
                    binary=False, max_features=15000)
        counts_mat = cv.fit_transform(l1 + l2).toarray()

    vocab_size = len(cv.vocabulary_)
    logger.info("Vocab size is {}".format(vocab_size))

    # Now sum over languages...
    if type(prior) is float:
        priors = np.array([prior for i in range(vocab_size)])
    else:
        priors = prior
    z_scores = np.empty(priors.shape[0])
    count_matrix = np.empty([2, vocab_size], dtype=np.float32)
    count_matrix[0, :] = np.sum(counts_mat[:len(l1), :], axis=0)
    count_matrix[1, :] = np.sum(counts_mat[len(l1):, :], axis=0)
    a0 = np.sum(priors)
    n1 = 1. * np.sum(count_matrix[0, :])
    n2 = 1. * np.sum(count_matrix[1, :])

    logger.info("Comparing language...")
    for i in range(vocab_size):
        # compute delta
        term1 = np.log((count_matrix[0, i] + priors[i]) /
                       (n1 + a0 - count_matrix[0, i] - priors[i]))
        term2 = np.log((count_matrix[1, i] + priors[i]) /
                       (n2 + a0 - count_matrix[1, i] - priors[i]))
        delta = term1 - term2
        # compute variance on delta
        var = 1. / (count_matrix[0, i] + priors[i]) + 1. / (count_matrix[1, i] + priors[i])
        # store final score
        z_scores[i] = delta / np.sqrt(var)

    index_to_term = {v: k for k, v in cv.vocabulary_.items()}
    sorted_indices = np.argsort(z_scores)
    z_scores = pd.DataFrame([(index_to_term[i], z_scores[i]) for i in sorted_indices],
                            columns=['term', 'z-score'])
    logger.info("Done")
    return z_scores, count_matrix
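# Hedged usage sketch (not from the original source): the same comparison as for the earlier
# variant, now returning a DataFrame plus the raw count matrix. Assumes `logger` and
# `basic_sanitize` exist in the module, as the function body requires; the documents are
# illustrative, and a permissive vectorizer is passed because the default min_df=10 would
# discard everything this small.
sample_cv = CV(decode_error='ignore', min_df=1, binary=False, ngram_range=(1, 2))
docs_a = ["the cat sat on the mat", "the cat chased the mouse"]
docs_b = ["the dog barked at the mailman", "the dog chased the ball"]
z_df, counts = bayes_compare_language(docs_a, docs_b, prior=0.01, cv=sample_cv)
print(z_df.head())   # terms most associated with docs_b (lowest z-scores)
print(z_df.tail())   # terms most associated with docs_a (highest z-scores)
print(counts.shape)  # (2, vocabulary size)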
if settings.ALGORITHM:
    print("Data fetched successfully")
    print("Data Train Unknown: %d Data Train: %d Data Test: %d" %
          (len(train_data_unknown), len(train_data), len(test_data)))

filtered = []
for i in range(0, len(train_data_unknown), len(train_data)):
    X = [j[3] for j in train_data]
    y = [int(1) for j in train_data]
    to_evaluate = [j[3] for j in train_data_unknown[i:i + len(train_data)]]
    X += to_evaluate
    y += [int(0) for j in to_evaluate]
    counter = CV()
    vector = counter.fit_transform(clean_text(X))
    to_evaluate_vector = counter.transform(clean_text(to_evaluate))
    bayes = NB()
    bayes.fit(vector, y)
    predict = bayes.predict_proba(to_evaluate_vector)
    for j in range(len(predict)):
        if predict[j][1] > 0.9:
            filtered.append(train_data_unknown[i + j])

if settings.DEBUG_MODE:
    print("Data filtered successfully")
    print("Data Filtered: ", len(filtered))
    # the stemmer object runs the stem() func
    # set() will make bigger texts run faster
    review = [ps.stem(word) for word in review
              if word not in set(stopwords.words('english'))]
    # join the result into one string, by space
    review = ' '.join(review)
    corpus.append(review)

# Creating the Bag of Words model:
# after cleaning each "review" string, create a column for each word and mark 1 if that word
# occurs in the string
from sklearn.feature_extraction.text import CountVectorizer as CV
# it also removes any stop words; it could do the above cleaning as well
# the full corpus vocabulary would be 1565 words; we keep only the 1500 most frequent
cv = CV(max_features=1500)
# build the sparse matrix and convert it to a dense array
X = cv.fit_transform(corpus).toarray()
# And the dependent variable: our 1 yay, or 0 nay
y = dataset.iloc[:, 1].values

# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split  # sklearn.cross_validation has been removed
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=0)

# Fitting Naive Bayes to the Training set
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
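# Hedged continuation sketch (not from the original snippet): fit the classifier and score it
# on the held-out split defined above.
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
print((y_pred == y_test).mean())  # simple accuracy on the test set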