def test_tfidf_vectorizer():
    """
    Stop words are words that are useless for classification: they usually have a
    high term frequency (TF) but a very low IDF, so they contribute nothing to
    separating classes. To save space and computation we treat them as stop words
    and tell the machine not to compute scores for them.

    TfidfVectorizer
        stop_words     stop-word list
        token_pattern  filtering rule (a regular expression)
    After fit_transform:
        vocabulary_    the vocabulary (a dict)
        idf_           returns the IDF values
        stop_words_    returns the stop-word set
    :return:
    """
    tfidf_vec = TfidfVectorizer()
    print(tfidf_vec)
    documents = [
        'this is the bayes document',
        'this is the second document',
        'and the third one',
        'is this the document'
    ]
    tfidf_matrix = tfidf_vec.fit_transform(documents)
    print(tfidf_vec.get_feature_names())
    print(tfidf_vec.get_stop_words())
    print(tfidf_vec.get_params())
    print(tfidf_vec.vocabulary_)
    print(tfidf_matrix.toarray())
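# --- Added sketch (not part of the original snippet; assumes TfidfVectorizer is
# already imported at module level): demonstrates the stop_words and
# token_pattern parameters described in the docstring above.
def sketch_stop_words_and_token_pattern():
    docs = ['this is the bayes document', 'is this the document']
    vec = TfidfVectorizer(stop_words=['is', 'the'],               # explicit stop list
                          token_pattern=r'(?u)\b[a-zA-Z]{2,}\b')  # keep 2+ letter words
    vec.fit_transform(docs)
    print(vec.vocabulary_)  # remaining terms mapped to column indices
    print(vec.idf_)         # one learned IDF weight per vocabulary term
    # Note: stop_words_ holds terms dropped by min_df/max_df/max_features; the
    # explicit stop list never enters the vocabulary, so it prints empty here.
    print(vec.stop_words_)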
def document_tfid_parser(documents):
    # We want to parse one document at a time; it does not work when fed all
    # documents at once.
    sumOfDocuments = []
    for document in documents:
        sumOfDocuments.append(str(document[0]).replace('_', ' '))
    if len(sumOfDocuments) > 4:
        vectoriser = TfidfVectorizer(max_df=0.7)
        X = vectoriser.fit_transform(sumOfDocuments)
        print(vectoriser.get_feature_names())
        print(vectoriser.get_params())
        print(vectoriser.get_stop_words())
        print(X.shape)
        print(X)
def get_data_with_dandelion(self, relevance_threshold=0.75, min_df=2, gamma=0.89,
                            filter=False):
    only_text, ent, data = self.get_data_with_abstract_2(relevance_threshold)
    entities_sparse = sparse.csr_matrix(ent)
    tfidf_vectorizer = TfidfVectorizer(max_df=0.5, max_features=200000,
                                       min_df=min_df, stop_words='english',
                                       strip_accents='unicode', use_idf=True,
                                       ngram_range=(1, 1), norm='l2',
                                       tokenizer=TextUtils.tokenize_and_stem)
    tfidf_matrix = tfidf_vectorizer.fit_transform(only_text)
    print('tfidf matrix dimension: %s x %s' % (tfidf_matrix.shape[0], tfidf_matrix.shape[1]))
    print('entities matrix dimension: %s x %s' % (entities_sparse.shape[0], entities_sparse.shape[1]))
    print('non zero elements in entities matrix: %s' % len(entities_sparse.data))

    # Balance the two blocks: tf-idf weights stay as-is, entities are scaled by (1 - gamma).
    tfidf_matrix = tfidf_matrix * 1
    entities_sparse = entities_sparse * (1 - gamma)

    f_score_dict = self.labels_dict(data)
    params = tfidf_vectorizer.get_params()
    params['dandelion_entities'] = entities_sparse.shape[1]
    params['original_terms'] = tfidf_matrix.shape[1]  # vocabulary size (shape[0] is the document count)
    params['gamma'] = gamma
    params['relevance_threshold'] = relevance_threshold
    params['classes'] = len(f_score_dict)
    params['tokenizer'] = 'TextUtils.tokenize_and_stem'
    del params['dtype']
    params['avg_nnz_row'] = (entities_sparse > 0).sum(1).mean()
    return sparse.hstack([tfidf_matrix, entities_sparse]), f_score_dict, params
class WrappedVectorizer:
    def __init__(self, sanitizer=None, sg_only=False, *args, **kwargs):
        self.sg_only = sg_only
        self.sanitizer = sanitizer
        self.vectorizer = TfidfVectorizer(*args, **kwargs)

    def fit(self, data, labels=None):
        if self.sg_only:
            if labels is None:
                raise Exception('fit: Labels cannot be None if sg_only=True')
            else:
                data = np.array(data)[np.array(labels) == 4]
        if self.sanitizer is not None:
            data = self.sanitizer(data)
        self.vectorizer.fit(data)

    def transform(self, data):
        if self.sanitizer is not None:
            data = self.sanitizer(data)
        return self.vectorizer.transform(data)

    def fit_transform(self, data, labels):
        self.fit(data, labels)
        return self.transform(data)

    def set_params(self, **parameters):
        # Handle our own params first ...
        for key in ['sg_only', 'sanitizer']:
            if key in parameters:
                setattr(self, key, parameters[key])
                del parameters[key]
        # ... and forward the remaining ones to the scikit-learn vectorizer.
        self.vectorizer.set_params(**parameters)
        # Don't forget to return self; see
        # https://stackoverflow.com/questions/28124366/can-gridsearchcv-be-used-with-a-custom-classifier
        return self

    def get_params(self, deep=True):
        if deep:
            return dict(**dict(sg_only=self.sg_only, sanitizer=self.sanitizer),
                        **self.vectorizer.get_params())
        else:
            return dict(sg_only=self.sg_only, sanitizer=self.sanitizer)

    def __repr__(self):
        return "WrappedVectorizer(%s)" % ", ".join(
            ["%s=%r" % t for t in self.get_params().items()])
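# --- Added sketch (an assumption, not from the original repo): how the
# get_params/set_params contract above lets WrappedVectorizer participate in a
# scikit-learn grid search. The step names and parameter grid are illustrative;
# unknown keys such as ngram_range are forwarded to the inner TfidfVectorizer.
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

def sketch_grid_search(texts, labels):
    pipe = Pipeline([('vec', WrappedVectorizer()), ('clf', LinearSVC())])
    grid = {'vec__sg_only': [False], 'vec__ngram_range': [(1, 1), (1, 2)]}
    search = GridSearchCV(pipe, grid, cv=3)
    search.fit(texts, labels)
    return search.best_params_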
class ColNormedTfidf(TransformerMixin):
    """
    Model that derives tf-idf reweighted representations of utterances,
    which are normalized by column. Can be used in ConvoKit through the
    `ColNormedTfidfTransformer` transformer; see documentation of that
    transformer for further details.
    """

    def __init__(self, **kwargs):
        if 'token_pattern' in kwargs:
            self.tfidf_model = TfidfVectorizer(**kwargs)
        else:
            self.tfidf_model = TfidfVectorizer(token_pattern=r'(?u)(\S+)', **kwargs)

    def fit(self, X, y=None):
        tfidf_vects_raw = self.tfidf_model.fit_transform(X)
        self.col_norms = sparse.linalg.norm(tfidf_vects_raw, axis=0)
        return self  # follow the scikit-learn convention so fit() chains

    def transform(self, X):
        tfidf_vects_raw = self.tfidf_model.transform(X)
        tfidf_vect = tfidf_vects_raw / self.col_norms
        return tfidf_vect

    def fit_transform(self, X, y=None):
        self.fit(X, y)
        return self.transform(X)

    def get_feature_names(self):
        return self.tfidf_model.get_feature_names()

    def get_params(self, deep=True):
        return self.tfidf_model.get_params(deep=deep)

    def set_params(self, **params):
        return self.tfidf_model.set_params(**params)

    def load(self, dirname):
        self.tfidf_model = joblib.load(os.path.join(dirname, 'tfidf_model.joblib'))
        self.col_norms = np.load(os.path.join(dirname, 'tfidf_col_norms.npy'))

    def dump(self, dirname):
        os.makedirs(dirname, exist_ok=True)
        np.save(os.path.join(dirname, 'tfidf_col_norms.npy'), self.col_norms)
        joblib.dump(self.tfidf_model, os.path.join(dirname, 'tfidf_model.joblib'))
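# --- Added usage sketch (assumption: the module's imports, numpy and
# scipy.sparse, are in place and the inputs are plain utterance strings).
# Shows that each column of the training output has unit L2 norm after fitting.
def sketch_col_normed_tfidf():
    import numpy as np
    utterances = ['hello there', 'there there', 'general greeting']
    model = ColNormedTfidf()
    vects = np.asarray(model.fit_transform(utterances))
    print(np.linalg.norm(vects, axis=0))  # expect values close to 1.0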
def create_model(x_train, y_train, x_test, y_test):
    """Create a trained model using the best parameters."""
    print("\nCREATING FINAL MODEL...")
    vectorizer = TfidfVectorizer(ngram_range=(1, 3), max_df=0.9)
    print("vectorizer params:", vectorizer.get_params())
    linear_svc = svm.LinearSVC(C=1.0, dual=True, loss="hinge", penalty="l2")
    print("linear svc params:", linear_svc.get_params())
    linear_svc_pipeline = Pipeline(
        steps=[("vectorizer", vectorizer), ("linear_svc", linear_svc)])

    print("\nTRAINING FINAL MODEL...")
    linear_svc_pipeline.fit(x_train, y_train)

    print("\nPICKLING MODEL...")
    with open("final_model/trained_linear_svc.pkl", "wb") as list_pickle:
        pickle.dump(linear_svc_pipeline, list_pickle)

    print("\nUNPICKLING MODEL...")
    with open("final_model/trained_linear_svc.pkl", "rb") as list_unpickle:
        model = pickle.load(list_unpickle)

    print("Detailed classification report for final model:")
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.")
    y_true, y_pred = y_test, model.predict(x_test)
    print(model.score(x_test, y_test))
    print(model.get_params())  # note the call: without parentheses this printed the bound method
    print(classification_report(y_true, y_pred,
                                target_names=["negative", "neutral", "positive"]))
    print()
    print("Confusion matrix for final model:")
    print(confusion_matrix(y_true, y_pred))
    print()
def tfidf(eventgroup_id, use_full, use_glove, overwrite):
    fname = f'data/representations/representation_tf-idf_{eventgroup_id}.pkl'
    if use_full:
        fname = f'data/representations/representation_tf-idf_{eventgroup_id}_full.pkl'
    path = Path(fname)
    if path.exists() and not overwrite:
        logger.info(f"file {path.as_posix()} exists")
        return

    logger.info(f"loading documents (full={use_full})")
    docs = docs_cache.get(eventgroup_id, use_full)
    total_docs = len(docs)

    token_sets = []
    doc_ids = []
    for doc_id, texts in tqdm(docs.items(), total=total_docs, desc="tokenizing docs"):
        doc = list(tokenizer(' '.join(d.text for d in texts)))
        if doc:
            token_sets.append(doc)
            doc_ids.append(doc_id)

    logger.info("applying tf-idf")
    # identity passes the pre-tokenized lists straight through the vectorizer.
    vectorizer = TfidfVectorizer(tokenizer=identity, preprocessor=identity,
                                 dtype=np.float32)
    m = vectorizer.fit_transform(token_sets)

    logger.info("saving matrix")
    params = vectorizer.get_params()
    params.pop('preprocessor')
    params.pop('tokenizer')
    params.pop('dtype')
    params['name'] = 'tf-idf'
    joblib.dump((m, doc_ids, params), fname)
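# --- Added note (assumption: `identity` is defined elsewhere in this module).
# A module-level function rather than a lambda keeps the fitted vectorizer
# picklable by joblib; passing token_pattern=None as well would silence
# scikit-learn's warning that the default pattern is unused when a tokenizer
# is supplied.
def identity(x):
    return x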
class TfIdfEncoder(Preprocessor):
    """Wrapper around tf-idf providing the Preprocessor interface."""

    def __init__(self, params=None):
        """Initialize TfIdfEncoder."""
        if params is None:
            params = {}
        self.model = TfidfVectorizer(**params)

    def info(self):
        """Get model info."""
        return self.model.get_params()

    def fit(self, data):
        """Fit the model."""
        self.model.fit(data)

    def transform(self, data):
        """Transform the input data to a dense array."""
        return self.model.transform(data).toarray()
def get_data_only_with_abstract(self, relevance_threshold=0.75, min_df=0.01,
                                gamma=0.89, filter=False):
    only_text, ent, data = self.get_data_with_abstract_2(relevance_threshold)
    tfidf_vectorizer = TfidfVectorizer(max_df=0.5, max_features=200000,
                                       min_df=min_df, stop_words='english',
                                       strip_accents='unicode', use_idf=True,
                                       ngram_range=(1, 1), norm='l2',
                                       tokenizer=TextUtils.tokenize_and_stem)
    tfidf_matrix = tfidf_vectorizer.fit_transform(only_text)
    f_score_dict = self.labels_dict(data)
    params = tfidf_vectorizer.get_params()
    params['original_terms'] = tfidf_matrix.shape[1]  # vocabulary size
    params['gamma'] = gamma
    params['relevance_threshold'] = relevance_threshold
    params['classes'] = len(f_score_dict)
    params['tokenizer'] = 'TextUtils.tokenize_and_stem'
    return tfidf_matrix, f_score_dict, params
    review = review.lower()
    review = review.split()
    review = [ps.stem(word) for word in review
              if word not in stopwords.words('english')]
    review = ' '.join(review)
    corpus.append(review)

corpus[3]  # inspect one cleaned review

tfidf_v = TfidfVectorizer(max_features=2500, ngram_range=(1, 3))
X = tfidf_v.fit_transform(corpus).toarray()

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,
                                                    random_state=0)

tfidf_v.get_feature_names()[1:20]
tfidf_v.get_params()
data = pd.DataFrame(X_train, columns=tfidf_v.get_feature_names())

import numpy as np
import matplotlib.pyplot as plt


def plot_confusion_matrix(cm, classes, normalize=False,
                          title='Confusion matrix', cmap=plt.cm.Blues):
    """
    See full source and example:
    http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html

    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
# 20 newsgroups (part of sklearn)
print("loading 20 newsgroups dataset...")
tic = time()
dataset = fetch_20newsgroups(shuffle=True, random_state=0,
                             remove=('headers', 'footers', 'quotes'))
train_corpus = dataset.data  # a list of 11314 documents / entries
toc = time()
print("elapsed time: %.4f sec" % (toc - tic))

# Compute tf-idf (equivalent to CountVectorizer followed by TfidfTransformer).
# CountVectorizer produces the term-document matrix; tf-idf scales the tf counts
# by log(N / nt), where N is the number of docs and nt is the number of docs a
# word occurs in. If min_df/max_df are floats they are proportions of documents
# (min_df < nt/N < max_df); if ints they refer to the count nt, e.g. min_df=2.
tfidf = TfidfVectorizer(max_features=num_features, max_df=0.95, min_df=2,
                        stop_words='english')
print("tfidf parameters:")
print(tfidf.get_params())

# Generate the tf-idf term-document matrix.
A_tfidf_sp = tfidf.fit_transform(train_corpus)  # size D x V
print("number of docs: %d" % A_tfidf_sp.shape[0])
print("dictionary size: %d" % A_tfidf_sp.shape[1])

# tf-idf dictionary
tfidf_dict = tfidf.get_feature_names()

# Fit the LDA model (n_topics was renamed n_components in newer scikit-learn).
print("Fitting LDA model...")
lda_vb = LatentDirichletAllocation(n_components=num_topics, max_iter=10,
                                   learning_method='online', batch_size=512,
                                   random_state=0, n_jobs=-1)
tic = time()
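# --- Added sketch (not from the original script): illustrates the int-vs-float
# semantics of min_df/max_df described in the comment above.
from sklearn.feature_extraction.text import TfidfVectorizer

docs = ["apple banana", "apple cherry", "apple banana cherry", "apple"]
# min_df=2 (int): keep terms appearing in at least 2 documents.
# max_df=0.75 (float): drop terms in more than 75% of documents, which
# removes 'apple' here since it occurs in all 4.
vec = TfidfVectorizer(min_df=2, max_df=0.75)
vec.fit(docs)
print(sorted(vec.vocabulary_))  # ['banana', 'cherry']
print(vec.stop_words_)          # {'apple'}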
# In[25]:

y_test.shape

# In[25]:

tv.get_feature_names()[:20]
# Top 20 feature names for this data set, which shows 2-word and 3-word
# n-grams together.

# In[26]:

tv.get_params()  # gives the details of the count vectorizer applied

# In[28]:

# Data set after applying the count vectorizer
df_count = pd.DataFrame(X, columns=tv.get_feature_names())
df_count.head(10)

# In[29]:

# Applying the Multinomial NB algorithm
import importlib
import inspect

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost.sklearn import XGBClassifier
from tinydb import TinyDB

my_module = importlib.import_module('sklearn.feature_extraction.text')
my_class = getattr(my_module, 'CountVectorizer')
cv = my_class()
print("Module name: {}".format(type(cv).__name__))

models = [MultinomialNB(), RandomForestClassifier(), SVC(), XGBClassifier()]
params = {}

tfidf = TfidfVectorizer()
tfidf_params = {type(tfidf).__name__ + '__' + k: v
                for k, v in tfidf.get_params().items()
                if not inspect.isclass(v)}


def dumper(obj):
    try:
        return obj.toJSON()
    except AttributeError:
        return None


v0 = TinyDB('v0.json')
if len(v0.tables()) > 0:
    v0.purge_tables()

for model in models:
    table = v0.table(type(model).__name__)
    params = {}
    params['model_params'] = {type(model).__name__ + '__' + k: v
                              for k, v in model.get_params().items()
                              if not inspect.isclass(v)}
# generate dict{'word': score} with tf-idf
corpus = [
    '無料 ごはん おかず ごはん おかず ディナー クーポン クーポン 食事',
    '無料 ごはん おかず ごはん おかず ランチ クーポン クーポン 食事 昼飯',
    ''
]

# tf  is computed within a single doc (= corpus[i]);
# idf is computed over all docs (= corpus).
# In other words, if each corpus element holds one user's utterances, the whole
# document set becomes all users' utterances, and words that every user says in
# common get a low score.
tfidf_vectorizer = TfidfVectorizer(token_pattern='(?u)\\b\\w+\\b',
                                   max_features=3000)
feature_matrix = tfidf_vectorizer.fit_transform(corpus)
print('feature_matrix >> ')
print(feature_matrix)
feature_matrix_arr = feature_matrix.toarray()
print('feature_matrix_arr >> ')
print(feature_matrix_arr)
print('tfidf_vectorizer.vocabulary_ >> ')
print(tfidf_vectorizer.vocabulary_)
print('tfidf_vectorizer.get_feature_names() >> ')
print(tfidf_vectorizer.get_feature_names())
print('tfidf_vectorizer.get_params() >> ')
print(tfidf_vectorizer.get_params())
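# --- Added sketch (an assumption about intent): turning one matrix row into the
# dict{'word': score} the comment above promises.
def sketch_word_score_dict(vectorizer, matrix, row=0):
    names = vectorizer.get_feature_names()
    scores = matrix.toarray()[row]
    return {w: s for w, s in zip(names, scores) if s > 0}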
class FeatureBased(object):
    def __init__(self, train_path=None, valid_path=None, test_path=None,
                 model='multinomial', feat='bow'):
        self.train_path = train_path
        self.valid_path = valid_path
        self.test_path = test_path
        logger.info('train_path: %s' % train_path)
        logger.info('valid_path: %s' % valid_path)
        logger.info('test_path: %s' % test_path)

        # define the model
        if model == 'multinomial':
            self.model = naive_bayes.MultinomialNB()
            self.model_name = 'multinomial'
        elif model == 'linreg':
            self.model = linear_model.LogisticRegression(
                random_state=1234, solver='lbfgs', multi_class='multinomial',
                penalty='l2')
            self.model_name = 'linear_model.LogisticRegression'
        elif model == 'linsvm':
            self.model = svm.LinearSVC(random_state=1234, tol=1e-5, penalty='l2')
            self.model_name = 'svm.LinearSVC'
        else:
            raise NotImplementedError()

        if feat == 'bow':
            self.vectorizer = CountVectorizer(tokenizer=self.tokenizeText,
                                              ngram_range=(1, 1))
            self.feat_name = 'bag_of_words'
        elif feat == 'uni-bi-gram':
            self.vectorizer = CountVectorizer(tokenizer=self.tokenizeText,
                                              ngram_range=(1, 2))
            self.feat_name = 'uni_bi_grams'
        elif feat == 'tf':
            self.vectorizer = TfidfVectorizer(use_idf=False)
            self.feat_name = 'tf'
        elif feat == 'tfidf':
            self.vectorizer = TfidfVectorizer()
            self.feat_name = 'tfidf'
        else:
            raise NotImplementedError()

        logger.info('feat name: %s' % self.feat_name)
        logger.info('feat vectorizer parameters: %s' % self.vectorizer.get_params())
        logger.info('model name: %s' % self.model_name)
        logger.info('model parameters: %s' % self.model.get_params())

    def prepare_data(self):
        train_x, train_y = self.load_data(self.train_path)
        logger.info('train data is loaded. #samples: %d, #labels:%d'
                    % (len(train_x), len(train_y)))
        train_feat_vecs = self.text_to_features(train_x, is_trainset=True)
        logger.info('train_feat_vecs: %s' % str(train_feat_vecs.shape))
        self.features = self.vectorizer.vocabulary_  # voc={word: id (feature_index)}
        logger.info('number of features: %d' % len(self.features))
        train_label = self.text_to_label(train_y)
        self.train_data = (train_feat_vecs, train_label)

        valid_x, valid_y = self.load_data(self.valid_path)
        logger.info('valid data is loaded. #samples: %d, #labels:%d'
                    % (len(valid_x), len(valid_y)))
        valid_feat_vecs = self.text_to_features(valid_x)
        logger.info('valid_feat_vecs: %s' % str(valid_feat_vecs.shape))
        valid_label = self.text_to_label(valid_y)
        self.valid_data = (valid_feat_vecs, valid_label)

        test_x, test_y = self.load_data(self.test_path)
        logger.info('test data is loaded. #samples: %d, #labels:%d'
                    % (len(test_x), len(test_y)))
        test_feat_vecs = self.text_to_features(test_x)
        logger.info('test_feat_vecs: %s' % str(test_feat_vecs.shape))
        test_label = self.text_to_label(test_y)
        self.test_data = (test_feat_vecs, test_label)

    def load_data(self, data_path):
        with open(data_path, 'r') as f:
            lines = f.read().strip().split('\n')
        data_x = []
        data_y = []
        for line in lines:
            if len(line) > 0:
                act, utt = line.split(' ', 1)
                data_y.append(act.strip())
                data_x.append(utt.strip())
        return (data_x, data_y)

    def train(self):
        '''train the model on the training data'''
        self.model = self.model.fit(self.train_data[0], self.train_data[1])

    def eval(self):
        '''evaluate the model on the test data'''
        train_pred = self.model.predict(self.train_data[0])
        train_acc, train_f1 = self.metric(pred=train_pred, gold=self.train_data[1])
        valid_pred = self.model.predict(self.valid_data[0])
        valid_acc, valid_f1 = self.metric(pred=valid_pred, gold=self.valid_data[1])
        test_pred = self.model.predict(self.test_data[0])
        test_acc, test_f1 = self.metric(pred=test_pred, gold=self.test_data[1])
        logger.info('train: (acc = %.2f%%, f1 = %.2f), '
                    'valid: (acc = %.2f%%, f1 = %.2f) '
                    'test: (acc = %.2f%%, f1 = %.2f)'
                    % (train_acc, train_f1, valid_acc, valid_f1, test_acc, test_f1))

    def metric(self, pred, gold):
        acc = metrics.accuracy_score(gold, pred) * 100
        f1 = metrics.f1_score(gold, pred, average='weighted') * 100
        return acc, f1

    def predict(self, list_texts):
        feat_vecs = self.text_to_features(list_texts)
        label_pred = self.model.predict(feat_vecs)
        labels = {1: 'inform', 2: 'question', 3: 'directive', 4: 'commissive'}
        label_pred_string = [labels[l + 1] for l in label_pred]
        return label_pred, label_pred_string

    def text_to_label(self, data_y):
        return [int(label) - 1 for label in data_y]

    def tokenizeText(self, sample):
        tokens = sample.split(' ')
        return [token.lower().strip() for token in tokens if len(token) > 0]

    def text_to_features(self, data_x, is_trainset=False):
        '''data_x is a list of texts'''
        if is_trainset:
            return self.vectorizer.fit_transform(data_x)
        return self.vectorizer.transform(data_x)

    def save(self, model_path):
        model_path = model_path + '_' + self.model_name + '_' + self.feat_name
        model_vect_path = model_path + '_vectorizer'
        model_path += '.mdl'
        model_vect_path += '.mdl'
        with open(model_path, 'wb') as file:
            pickle.dump(self.model, file)
        with open(model_vect_path, 'wb') as file:
            pickle.dump(self.vectorizer, file)
        logger.info('model saved: %s' % model_path)
        logger.info('vectorizer saved: %s' % model_vect_path)

    def load(self, model_path):
        model_path = model_path + '_' + self.model_name + '_' + self.feat_name
        model_vect_path = model_path + '_vectorizer'
        model_path += '.mdl'
        model_vect_path += '.mdl'
        with open(model_path, 'rb') as file:
            self.model = pickle.load(file)
        with open(model_vect_path, 'rb') as file:
            self.vectorizer = pickle.load(file)
        logger.info('model loaded: %s' % model_path)
        logger.info('vectorizer loaded: %s' % model_vect_path)
def extract_features_age(docs_train, docs_val, docs_test, lsa=True):
    """Extract features

    This is basically a duplicate of the *extract_features_gender()* function,
    except that it does not use the PAN18AP test corpus as a second test set.
    """
    # Build a vectorizer that splits strings into sequences of 1 to 3 words
    word_vectorizer = TfidfVectorizer(preprocessor=None, analyzer='word',
                                      ngram_range=(1, 3), max_features=10**5,
                                      min_df=2, use_idf=True, sublinear_tf=True)
    # Build a vectorizer that splits strings into sequences of 3 to 5 characters
    char_vectorizer = TfidfVectorizer(preprocessor=None, analyzer='char',
                                      ngram_range=(3, 5), max_features=10**5,
                                      min_df=2, use_idf=True, sublinear_tf=True)
    # Log the parameters of the word and character vectorizers
    logger.info('word_vectorizer: %s', word_vectorizer.get_params())
    logger.info('char_vectorizer: %s', char_vectorizer.get_params())

    # Build a transformer (vectorizer) pipeline using the previous analyzers.
    # *FeatureUnion* concatenates the results of multiple transformer objects.
    ngrams_vectorizer = Pipeline([
        ('feats', FeatureUnion([('word_ngram', word_vectorizer),
                                ('char_ngram', char_vectorizer)]))
    ])

    # Fit (learn vocabulary and IDF) and transform (documents to the TF-IDF
    # matrix) the training set
    x_train_ngrams_tfidf = ngrams_vectorizer.fit_transform(docs_train)
    '''
    ↳ Check the following attributes of each of the transformers (analyzers) —
      *word_vectorizer* and *char_vectorizer*:
        vocabulary_ : dict. A mapping of terms to feature indices.
        stop_words_ : set. Terms that were ignored.
    '''
    logger.info('@ %.2f seconds: Finished fit_transforming the training dataset',
                time.process_time())
    feature_names_ngrams = [word_vectorizer.vocabulary_,
                            char_vectorizer.vocabulary_]
    logger.info('Size of vocabulary: %s words | %s characters',
                format(len(word_vectorizer.vocabulary_), ',d'),
                format(len(char_vectorizer.vocabulary_), ',d'))

    # Vectorize each validation/test set: extract features (transform the
    # documents to the TF-IDF matrix). Only transform is called on the
    # transformer (vectorizer), because it has already been fit to the training set.
    x_val_ngrams_tfidf = ngrams_vectorizer.transform(docs_val)
    logger.info('@ %.2f seconds: Finished transforming the validation set',
                time.process_time())
    x_test_ngrams_tfidf = ngrams_vectorizer.transform(docs_test)
    logger.info('@ %.2f seconds: Finished transforming the test set',
                time.process_time())
    logger.info('Word & character ngrams .shape = {training: %s | validation: %s | test: %s}',
                x_train_ngrams_tfidf.shape, x_val_ngrams_tfidf.shape,
                x_test_ngrams_tfidf.shape)

    # • Dimensionality reduction using truncated SVD (aka LSA)
    if lsa:
        # Build a truncated SVD (LSA) transformer object
        svd = TruncatedSVD(n_components=300, random_state=43)
        # Fit the LSA model and perform dimensionality reduction on the training set
        x_train_ngrams_tfidf_reduced = svd.fit_transform(x_train_ngrams_tfidf)
        logger.info('@ %.2f seconds: Finished dimensionality reduction (LSA) on the training set',
                    time.process_time())
        # Perform dimensionality reduction on the validation and test sets.
        # Note that the SVD (LSA) transformer is already fit on the training set.
        x_val_ngrams_tfidf_reduced = svd.transform(x_val_ngrams_tfidf)
        logger.info('@ %.2f seconds: Finished dimensionality reduction (LSA) on the validation set',
                    time.process_time())
        x_test_ngrams_tfidf_reduced = svd.transform(x_test_ngrams_tfidf)
        logger.info('@ %.2f seconds: Finished dimensionality reduction (LSA) on the test set',
                    time.process_time())
        x_train = x_train_ngrams_tfidf_reduced
        x_val = x_val_ngrams_tfidf_reduced
        x_test = x_test_ngrams_tfidf_reduced
    else:
        x_train = x_train_ngrams_tfidf
        x_val = x_val_ngrams_tfidf
        x_test = x_test_ngrams_tfidf

    return x_train, x_val, x_test, feature_names_ngrams
def get_data_fabio(self, gamma=0.89, rank_metric='r'):
    data = self.mongo.get_all(order_by='id_doc')
    data = [doc for doc in data]
    only_text = [doc['text'] for doc in data]

    entitySet = set()
    for d in data:
        if 'isa' in d:
            for e in d['isa']:
                entitySet.add(e['entity'])

    current = np.zeros((len(data), len(entitySet)), dtype=np.float64)
    count = 0
    invIndex = {}
    countFeatures = 0
    for i, d in enumerate(data):
        if 'isa' in d:
            for f in d['isa']:
                if f['entity'] not in invIndex:
                    invIndex[f['entity']] = countFeatures
                    countFeatures += 1
                current[count, invIndex[f['entity']]] = f[rank_metric]
        count += 1
    current = np.nan_to_num(current)
    current_sparse = sparse.csr_matrix(current)

    tfidf_vectorizer = TfidfVectorizer(max_df=0.5, max_features=200000,
                                       min_df=2, stop_words='english',
                                       strip_accents='unicode', use_idf=True,
                                       ngram_range=(1, 1), norm='l2',
                                       tokenizer=TextUtils.tokenize_and_stem)
    tfidf_matrix = tfidf_vectorizer.fit_transform(only_text)  # fit once; the original called this twice
    print('tfidf matrix dimension: %s x %s' % (tfidf_matrix.shape[0], tfidf_matrix.shape[1]))
    print('entities matrix dimension: %s x %s' % (current_sparse.shape[0], current_sparse.shape[1]))
    print('non zero elements in entities matrix: %s' % len(current_sparse.data))

    tfidf_matrix = tfidf_matrix * 1
    entities_sparse = current_sparse * (1 - gamma)

    f_score_dict = self.labels_dict(data)
    params = tfidf_vectorizer.get_params()
    params['dandelion_entities'] = entities_sparse.shape[1]
    params['original_terms'] = tfidf_matrix.shape[1]  # vocabulary size
    params['gamma'] = gamma
    params['rank_metric'] = rank_metric
    params['classes'] = len(f_score_dict)
    params['tokenizer'] = 'TextUtils.tokenize_and_stem'
    del params['dtype']
    params['avg_nnz_row'] = (entities_sparse > 0).sum(1).mean()
    return sparse.hstack([tfidf_matrix, entities_sparse]), f_score_dict, params
class LDA(GenericModel):
    def __init__(self, **kwargs):
        self._corpus_matrix = None
        self._query_vector = None
        self.vectorizer = None
        self.lda_model = LatentDirichletAllocation(n_jobs=-1)

        super().__init__()

        self.similarity_measure = None
        self.set_basic_params(**kwargs)
        self.set_vectorizer(**kwargs)
        self.set_lda_model(**kwargs)

    def set_name(self, name):
        super().set_name(name)

    def set_model_gen_name(self, gen_name):
        super().set_model_gen_name(gen_name)

    def set_basic_params(self, **kwargs):
        self.set_name('LDA' if LDA_Model_Hyperp.NAME.value not in kwargs.keys()
                      else kwargs[LDA_Model_Hyperp.NAME.value])
        self.set_model_gen_name('lda')
        self.set_similarity_measure(
            sm.SimilarityMeasure.COSINE
            if LDA_Model_Hyperp.SIMILARITY_MEASURE.value not in kwargs.keys()
            else kwargs[LDA_Model_Hyperp.SIMILARITY_MEASURE.value])

    def set_similarity_measure(self, sim_measure):
        self.similarity_measure = sim_measure

    def set_vectorizer(self, **kwargs):
        self.vectorizer = TfidfVectorizer(
            stop_words='english', use_idf=True, smooth_idf=True
        ) if LDA_Model_Hyperp.VECTORIZER.value not in kwargs.keys() \
            else kwargs[LDA_Model_Hyperp.VECTORIZER.value]
        vec_params = {key.split('__')[2]: kwargs[key]
                      for key, val in kwargs.items() if '__vectorizer__' in key}
        self.vectorizer.set_params(**vec_params)

    def set_lda_model(self, **kwargs):
        lda_model_params = {key.split('__')[2]: kwargs[key]
                            for key, val in kwargs.items() if '__lda_model__' in key}
        self.lda_model.set_params(**lda_model_params)

    def recover_links(self, corpus, query, test_cases_names, bug_reports_names):
        self._corpus_matrix = self.vectorizer.fit_transform(corpus)
        self._query_vector = self.vectorizer.transform(query)

        self.out_1 = self.lda_model.fit_transform(self._corpus_matrix)
        self.out_2 = self.lda_model.transform(self._query_vector)

        metric = self.similarity_measure
        if metric == sm.SimilarityMeasure.COSINE:
            self._sim_matrix = pairwise.cosine_similarity(X=self.out_1, Y=self.out_2)
        elif metric == sm.SimilarityMeasure.JSD:
            self._sim_matrix = pairwise_distances(X=self.out_1, Y=self.out_2,
                                                  metric=SimilarityMeasure.jsd)
        elif metric == sm.SimilarityMeasure.EUCLIDIAN_DISTANCE:
            self._sim_matrix = pairwise_distances(X=self.out_1, Y=self.out_2,
                                                  metric='euclidean')

        #self._sim_matrix = super().normalize_sim_matrix(self._sim_matrix)
        self._sim_matrix = pd.DataFrame(data=self._sim_matrix,
                                        index=test_cases_names,
                                        columns=bug_reports_names)
        self._record_docs_feats(corpus, query, test_cases_names, bug_reports_names)

    def _record_docs_feats(self, corpus, query, test_cases_names, bug_reports_names):
        self.mrw_tcs = self._recover_mrw_list(test_cases_names, corpus)
        self.mrw_brs = self._recover_mrw_list(bug_reports_names, query)
        self.dl_tcs = self._recover_dl_list(test_cases_names, corpus)
        self.dl_brs = self._recover_dl_list(bug_reports_names, query)

        index = list(test_cases_names) + list(bug_reports_names)
        self.docs_feats_df = pd.DataFrame(index=index, columns=['mrw', 'dl'])

        for tc_name, mrw in self.mrw_tcs:
            self.docs_feats_df.at[tc_name, 'mrw'] = mrw
        for tc_name, dl in self.dl_tcs:
            self.docs_feats_df.at[tc_name, 'dl'] = dl
        for br_name, mrw in self.mrw_brs:
            self.docs_feats_df.at[br_name, 'mrw'] = mrw
        for br_name, dl in self.dl_brs:
            self.docs_feats_df.at[br_name, 'dl'] = dl

    def _recover_dl_list(self, artf_names, artf_descs):
        tokenizer = PorterStemmerBased_Tokenizer()
        dl_list = []
        for artf_name, artf_desc in zip(artf_names, artf_descs):
            dl_list.append((artf_name, len(tokenizer(artf_desc))))
        return dl_list

    def _recover_mrw_list(self, artf_names, artf_descs):
        N_REL_WORDS = 6
        mrw_list = []  # list of tuples (artf_name, mrw_list={})
        for artf_name, artf_desc in zip(artf_names, artf_descs):
            X = self.vectorizer.transform([artf_desc])
            df1 = pd.DataFrame(X.T.toarray())
            df1['token'] = self.vectorizer.get_feature_names()
            df1.sort_values(by=0, ascending=False, inplace=True)
            mrw = list(df1.iloc[0:N_REL_WORDS, 1].values)
            mrw_list.append((artf_name, mrw))
        return mrw_list

    def model_setup(self):
        return {"Setup": [
            {"Name": self.get_name()},
            {"Similarity Measure and Minimum Threshold": self.get_sim_measure_min_threshold()},
            {"Top Value": self.get_top_value()},
            {"LDA Model": self.lda_model.get_params()},
            {"Vectorizer": self.vectorizer.get_params()},
            {"Vectorizer Type": type(self.vectorizer)}
        ]}

    def get_name(self):
        return super().get_name()

    def get_model_gen_name(self):
        return super().get_model_gen_name()

    def get_similarity_measure(self):
        return self.similarity_measure

    def get_sim_matrix(self):
        return super().get_sim_matrix()

    def get_tokenizer_type(self):
        return type(self.tokenizer)

    def save_sim_matrix(self):
        super().save_sim_matrix()

    def get_query_vector(self):
        return self._query_vector

    def get_corpus_matrix(self):
        return self._corpus_matrix

    def get_vectorizer_type(self):
        return type(self.vectorizer)

    def print_topics(self):
        feature_names = self.vectorizer.get_feature_names()
        n_top_words = 10
        for topic_idx, topic in enumerate(self.lda_model.components_):
            message = "Topic #%d: " % topic_idx
            message += " ".join([feature_names[i]
                                 for i in topic.argsort()[:-n_top_words - 1:-1]])
            print(message)
class LSI(GenericModel):
    def __init__(self, **kwargs):
        self._svd_matrix = None
        self._query_vector = None
        self.vectorizer = None
        self.svd_model = None

        super().__init__()

        self.similarity_measure = None
        self.set_basic_params(**kwargs)
        self.set_vectorizer(**kwargs)
        self.set_svd_model(**kwargs)

    def set_name(self, name):
        super().set_name(name)

    def set_model_gen_name(self, gen_name):
        super().set_model_gen_name(gen_name)

    def set_basic_params(self, **kwargs):
        self.set_name('LSI' if LSI_Model_Hyperp.NAME.value not in kwargs.keys()
                      else kwargs[LSI_Model_Hyperp.NAME.value])
        self.set_similarity_measure(SimilarityMeasure.COSINE)
        self.set_model_gen_name('lsi')

    def set_similarity_measure(self, sim_measure):
        self.similarity_measure = sim_measure

    def set_vectorizer(self, **kwargs):
        self.vectorizer = TfidfVectorizer(
            stop_words='english', use_idf=True, smooth_idf=True
        ) if LSI_Model_Hyperp.VECTORIZER.value not in kwargs.keys() \
            else kwargs[LSI_Model_Hyperp.VECTORIZER.value]
        vec_params = {key.split('__')[2]: kwargs[key]
                      for key, val in kwargs.items() if '__vectorizer__' in key}
        self.vectorizer.set_params(**vec_params)

    def set_svd_model(self, **kwargs):
        self.svd_model = TruncatedSVD(
            n_components=100, algorithm='randomized', n_iter=10, random_state=42
        ) if LSI_Model_Hyperp.SVD_MODEL.value not in kwargs.keys() \
            else kwargs[LSI_Model_Hyperp.SVD_MODEL.value]
        svd_model_params = {key.split('__')[2]: kwargs[key]
                            for key, val in kwargs.items() if '__svd_model__' in key}
        self.svd_model.set_params(**svd_model_params)

    def recover_links(self, corpus, query, test_cases_names, bug_reports_names):
        if self.similarity_measure == SimilarityMeasure.COSINE:
            self._recover_links_cosine(corpus, query, test_cases_names, bug_reports_names)
        elif self.similarity_measure == SimilarityMeasure.JACCARD_INDEX:
            self._recover_links_jaccard(corpus, query, test_cases_names, bug_reports_names)
        elif self.similarity_measure == SimilarityMeasure.EDIT_DISTANCE:
            self._recover_links_edit(corpus, query, test_cases_names, bug_reports_names)
        self._record_docs_feats(corpus, query, test_cases_names, bug_reports_names)

    def _record_docs_feats(self, corpus, query, test_cases_names, bug_reports_names):
        self.mrw_tcs = self._recover_mrw_list(test_cases_names, corpus)
        self.mrw_brs = self._recover_mrw_list(bug_reports_names, query)
        self.dl_tcs = self._recover_dl_list(test_cases_names, corpus)
        self.dl_brs = self._recover_dl_list(bug_reports_names, query)

        index = list(test_cases_names) + list(bug_reports_names)
        self.docs_feats_df = pd.DataFrame(index=index, columns=['mrw', 'dl'])

        for tc_name, mrw in self.mrw_tcs:
            self.docs_feats_df.at[tc_name, 'mrw'] = mrw
        for tc_name, dl in self.dl_tcs:
            self.docs_feats_df.at[tc_name, 'dl'] = dl
        for br_name, mrw in self.mrw_brs:
            self.docs_feats_df.at[br_name, 'mrw'] = mrw
        for br_name, dl in self.dl_brs:
            self.docs_feats_df.at[br_name, 'dl'] = dl

    def _recover_dl_list(self, artf_names, artf_descs):
        tokenizer = WordNetBased_LemmaTokenizer()
        dl_list = []
        for artf_name, artf_desc in zip(artf_names, artf_descs):
            dl_list.append((artf_name, len(tokenizer(artf_desc))))
        return dl_list

    def _recover_mrw_list(self, artf_names, artf_descs):
        N_REL_WORDS = 6
        mrw_list = []  # list of tuples (artf_name, mrw_list={})
        for artf_name, artf_desc in zip(artf_names, artf_descs):
            X = self.vectorizer.transform([artf_desc])
            df1 = pd.DataFrame(X.T.toarray())
            df1['token'] = self.vectorizer.get_feature_names()
            df1.sort_values(by=0, ascending=False, inplace=True)
            mrw = list(df1.iloc[0:N_REL_WORDS, 1].values)
            mrw_list.append((artf_name, mrw))
        return mrw_list

    def _recover_links_cosine(self, corpus, query, test_cases_names, bug_reports_names):
        svd_transformer = Pipeline([('vec', self.vectorizer),
                                    ('svd', self.svd_model)])
        self._svd_matrix = svd_transformer.fit_transform(corpus)
        self._query_vector = svd_transformer.transform(query)
        self._sim_matrix = pairwise.cosine_similarity(X=self._svd_matrix,
                                                      Y=self._query_vector)
        #self._sim_matrix = super().normalize_sim_matrix(self._sim_matrix)
        self._sim_matrix = pd.DataFrame(data=self._sim_matrix,
                                        index=test_cases_names,
                                        columns=bug_reports_names)

    def _recover_links_jaccard(self, corpus, query, test_cases_names, bug_reports_names):
        tokenizer = self.vectorizer.tokenizer
        corpus_tokens = [tokenizer(doc) for doc in corpus]
        query_tokens = [tokenizer(doc) for doc in query]
        self._sim_matrix = pd.DataFrame(
            index=test_cases_names, columns=bug_reports_names,
            data=np.zeros(shape=(len(test_cases_names), len(bug_reports_names)),
                          dtype='int8'))
        for br_id, doc_query_tset in zip(bug_reports_names, query_tokens):
            for tc_id, doc_corpus_tset in zip(test_cases_names, corpus_tokens):
                self._sim_matrix.at[tc_id, br_id] = nltk.jaccard_distance(
                    set(doc_corpus_tset), set(doc_query_tset))

    def _recover_links_edit(self, corpus, query, test_cases_names, bug_reports_names):
        self._sim_matrix = pd.DataFrame(
            index=test_cases_names, columns=bug_reports_names,
            data=np.zeros(shape=(len(test_cases_names), len(bug_reports_names)),
                          dtype='int8'))
        for br_id, doc_query in zip(bug_reports_names, query):
            for tc_id, doc_corpus in zip(test_cases_names, corpus):
                self._sim_matrix.at[tc_id, br_id] = nltk.edit_distance(doc_corpus, doc_query)
        normalizer = Normalizer(copy=False).fit(self._sim_matrix.values)
        self._sim_matrix = pd.DataFrame(data=normalizer.transform(self._sim_matrix.values),
                                        index=test_cases_names,
                                        columns=bug_reports_names)

    def model_setup(self):
        return {"Setup": [
            {"Name": self.get_name()},
            {"Similarity Measure": self.get_similarity_measure()},
            {"SVD Model": self.svd_model.get_params()},
            {"Vectorizer": self.vectorizer.get_params()},
            {"Vectorizer Type": type(self.vectorizer)}
        ]}

    def get_query_vector(self):
        return self._query_vector

    def get_svd_matrix(self):
        return self._svd_matrix

    def get_vectorizer_type(self):
        return type(self.vectorizer)

    def get_tokenizer_type(self):
        return type(self.vectorizer.tokenizer)

    def get_name(self):
        return super().get_name()

    def get_model_gen_name(self):
        return super().get_model_gen_name()

    def get_similarity_measure(self):
        return self.similarity_measure

    def get_sim_matrix(self):
        return super().get_sim_matrix()

    def save_sim_matrix(self):
        super().save_sim_matrix()
from sklearn.feature_extraction.text import TfidfVectorizer

corpus = [
    'This is the first document.',
    'This document is the second document.',
    'And this is the third one.',
    'Is this the first document?',
]
# sorted feature names: 'and document first is one second the third this'

vectorizer = TfidfVectorizer()
x = vectorizer.fit_transform(corpus)
print("get_feature_names:", vectorizer.get_feature_names())
print("len:", len(vectorizer.get_feature_names()))
print("get_stop_words:", vectorizer.get_stop_words())
print("get_params:", vectorizer.get_params())
print(x.shape)
print("vocabulary:", vectorizer.vocabulary_)
print("idf:", vectorizer.idf_)
print(x)
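# --- Added sketch (not from the original snippet): reproduces idf_ by hand using
# scikit-learn's smoothed formula, idf(t) = ln((1 + n) / (1 + df(t))) + 1.
import numpy as np

n = len(corpus)                                  # number of documents
df = (x > 0).sum(axis=0).A1                      # document frequency per term
manual_idf = np.log((1 + n) / (1 + df)) + 1
print(np.allclose(manual_idf, vectorizer.idf_))  # True with the smooth_idf=True default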
often used in information retrieval and text mining. It's a statistical
measure used to evaluate how important a word is to a document in a
collection or a corpus.
"""

# Create the vectorizer parameters and fit the vectorizer to the synopses.
tfidf = TfidfVectorizer(max_df=0.8, max_features=2000, min_df=0.2,
                        stop_words='english', use_idf=True,
                        tokenizer=tokenization_and_stemming, ngram_range=(1, 1))
tfidf_matrix = tfidf.fit_transform(synoposes)

# Save the terms identified by TF-IDF.
tf_selected_words = tfidf.get_feature_names()

# Print out the matrix, parameters and main features of the TF-IDF vector.
print("In total, there are " + str(tfidf_matrix.shape[0]) + " synopses and "
      + str(tfidf_matrix.shape[1]) + " terms.")
print("The parameters of the TF-IDF vector are: ", tfidf.get_params())
print()
print("<TFIDF-Matrix>")
print(tfidf_matrix)
print()
print("<Selected Feature Names>")
print(tf_selected_words)
print()

# Calculate document similarity:
from sklearn.metrics.pairwise import cosine_similarity
cos_matrix = cosine_similarity(tfidf_matrix)
print(cos_matrix)

"""
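# --- Added note (a sketch, not from the original script): because
# TfidfVectorizer L2-normalizes rows by default (norm='l2'), the cosine
# similarity matrix above equals a plain dot product of the tf-idf matrix
# with itself.
import numpy as np
from sklearn.metrics.pairwise import linear_kernel
assert np.allclose(cos_matrix, linear_kernel(tfidf_matrix, tfidf_matrix))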
tfidf_matrix = tfidf_model.fit_transform(reviews)  # fit the vectorizer to the synopses
print("In total, there are " + str(tfidf_matrix.shape[0]) + " summaries and "
      + str(tfidf_matrix.shape[1]) + " terms.")

# In[13]:

tfidf_matrix[0]

# In[14]:

tfidf_model.get_params()

# Save the terms identified by TF-IDF.

# In[15]:

tf_selected_words = tfidf_model.get_feature_names()

# # (Optional) Calculate Document Similarity

# In[16]:
cv = TfidfVectorizer(max_features=5000, ngram_range=(1, 3))
X = cv.fit_transform(corpus).toarray()
y = messages.label

## train/test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25,
                                                    random_state=42)

## get the parameters and feature names
cv.get_feature_names()
cv.get_params()

## final dataframe
final_df = pd.DataFrame(X_train, columns=cv.get_feature_names())

## Creating the machine learning model
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB()
clf.fit(X_train, y_train)
y_predict = clf.predict(X_test)

## measuring the performance of the classifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
acc = accuracy_score(y_test, y_predict)
class VSM(GenericModel):
    def __init__(self, **kwargs):
        self._terms_matrix = None
        self._query_vector = None
        self.vectorizer = None
        self.svd_model = None

        super().__init__()

        self.similarity_measure = None
        self.set_basic_params(**kwargs)
        self.set_vectorizer(**kwargs)

    def set_name(self, name):
        super().set_name(name)

    def set_model_gen_name(self, gen_name):
        super().set_model_gen_name(gen_name)

    def set_basic_params(self, **kwargs):
        self.set_name('VSM' if VSM_Model_Hyperp.NAME.value not in kwargs.keys()
                      else kwargs[VSM_Model_Hyperp.NAME.value])
        self.set_similarity_measure(SimilarityMeasure.COSINE)
        self.set_model_gen_name('vsm')

    def set_similarity_measure(self, sim_measure):
        self.similarity_measure = sim_measure

    def set_vectorizer(self, **kwargs):
        self.vectorizer = TfidfVectorizer(
            stop_words='english', use_idf=True, smooth_idf=True
        ) if VSM_Model_Hyperp.VECTORIZER.value not in kwargs.keys() \
            else kwargs[VSM_Model_Hyperp.VECTORIZER.value]
        vec_params = {key.split('__')[2]: kwargs[key]
                      for key, val in kwargs.items() if '__vectorizer__' in key}
        self.vectorizer.set_params(**vec_params)

    def recover_links(self, corpus, query, test_cases_names, bug_reports_names):
        starttime = time.time()
        self._recover_links_cosine(corpus, query, test_cases_names, bug_reports_names)
        self._record_docs_feats(corpus, query, test_cases_names, bug_reports_names)
        endtime = time.time()
        print(f'  ..Total processing time: {round(endtime - starttime, 2)} seconds')

    def _record_docs_feats(self, corpus, query, test_cases_names, bug_reports_names):
        self.mrw_tcs = self._recover_mrw_list(test_cases_names, corpus)
        self.mrw_brs = self._recover_mrw_list(bug_reports_names, query)
        self.dl_tcs = self._recover_dl_list(test_cases_names, corpus)
        self.dl_brs = self._recover_dl_list(bug_reports_names, query)

        index = list(test_cases_names) + list(bug_reports_names)
        self.docs_feats_df = pd.DataFrame(index=index, columns=['mrw', 'dl'])

        for tc_name, mrw in self.mrw_tcs:
            self.docs_feats_df.at[tc_name, 'mrw'] = mrw
        for tc_name, dl in self.dl_tcs:
            self.docs_feats_df.at[tc_name, 'dl'] = dl
        for br_name, mrw in self.mrw_brs:
            self.docs_feats_df.at[br_name, 'mrw'] = mrw
        for br_name, dl in self.dl_brs:
            self.docs_feats_df.at[br_name, 'dl'] = dl

    def _recover_dl_list(self, artf_names, artf_descs):
        tokenizer = WordNetBased_LemmaTokenizer()
        dl_list = []
        for artf_name, artf_desc in zip(artf_names, artf_descs):
            dl_list.append((artf_name, len(tokenizer(artf_desc))))
        return dl_list

    def _recover_mrw_list(self, artf_names, artf_descs):
        N_REL_WORDS = 6
        mrw_list = []  # list of tuples (artf_name, mrw_list={})
        for artf_name, artf_desc in zip(artf_names, artf_descs):
            X = self.vectorizer.transform([artf_desc])
            df1 = pd.DataFrame(X.T.toarray())
            df1['token'] = self.vectorizer.get_feature_names()
            df1.sort_values(by=0, ascending=False, inplace=True)
            mrw = list(df1.iloc[0:N_REL_WORDS, 1].values)
            mrw_list.append((artf_name, mrw))
        return mrw_list

    def _recover_links_cosine(self, corpus, query, test_cases_names, bug_reports_names):
        transformer = Pipeline([('vec', self.vectorizer)])
        self._terms_matrix = transformer.fit_transform(corpus)
        self._query_vector = transformer.transform(query)
        self._sim_matrix = pairwise.cosine_similarity(X=self._terms_matrix,
                                                      Y=self._query_vector)
        #self._sim_matrix = super().normalize_sim_matrix(self._sim_matrix)
        self._sim_matrix = pd.DataFrame(data=self._sim_matrix,
                                        index=test_cases_names,
                                        columns=bug_reports_names)

    def model_setup(self):
        return {"Setup": [
            {"Name": self.get_name()},
            {"Similarity Measure": self.get_similarity_measure()},
            {"Vectorizer": self.vectorizer.get_params()},
            {"Vectorizer Type": type(self.vectorizer)}
        ]}

    def get_query_vector(self):
        return self._query_vector

    def get_terms_matrix(self):
        return self._terms_matrix

    def get_vectorizer_type(self):
        return type(self.vectorizer)

    def get_tokenizer_type(self):
        return type(self.vectorizer.tokenizer)

    def get_name(self):
        return super().get_name()

    def get_model_gen_name(self):
        return super().get_model_gen_name()

    def get_similarity_measure(self):
        return self.similarity_measure

    def get_sim_matrix(self):
        return super().get_sim_matrix()

    def save_sim_matrix(self):
        super().save_sim_matrix()
def train(x_train, y_train, x_test, y_test):
    ######################## INIT ###################################
    print("\nINITIALIZING CLASSIFIER...")
    # vectorizer = TfidfVectorizer(ngram_range=(1, 2))
    vectorizer = TfidfVectorizer()
    print("vectorizer params:", vectorizer.get_params())
    linear_svc = svm.LinearSVC()
    print("linear svc params:", linear_svc.get_params())
    linear_svc_pipeline = Pipeline(
        steps=[("vectorizer", vectorizer), ("linear_svc", linear_svc)])

    ################### CROSS VAL and GRID SEARCH ####################
    print("\nPERFORMING GRID SEARCH WITH CROSS VALIDATION...")
    k_fold = KFold(n_splits=20, shuffle=True, random_state=1)
    # k_fold = KFold(n_splits=5, shuffle=True)
    linear_svc_params = [
        {   # Dual optimization
            "linear_svc__penalty": ["l2"],  # with l1 you can't use hinge
            "linear_svc__loss": ["hinge", "squared_hinge"],
            "linear_svc__dual": [True],     # if False, you can't use l2 + hinge
            # "linear_svc__tol": [1e-4, 1e-5],
            # "linear_svc__C": [0.001, 0.01, 0.1, 1, 10, 100, 1000],
            "linear_svc__C": [0.1, 1],
            # "linear_svc__multi_class": ["ovr", "crammer_singer"],
            # "vectorizer__stop_words": [None, stop_words.ENGLISH_STOP_WORDS],
            "vectorizer__ngram_range": [(1, 2), (1, 3)],
            "vectorizer__max_df": [0.9, 1.0],
            # "vectorizer__use_idf": [True, False]
        },
        {   # Primal optimization
            "linear_svc__penalty": ["l1", "l2"],
            "linear_svc__loss": ["squared_hinge"],
            "linear_svc__dual": [False],
            # "linear_svc__tol": [1e-4, 1e-5],
            # "linear_svc__C": [0.001, 0.01, 0.1, 1, 10, 100, 1000],
            "linear_svc__C": [0.1, 1],
            # "linear_svc__multi_class": ["ovr", "crammer_singer"],
            # "vectorizer__stop_words": [None, stop_words.ENGLISH_STOP_WORDS],
            "vectorizer__ngram_range": [(1, 2), (1, 3)],
            "vectorizer__max_df": [0.9, 1.0],
            # "vectorizer__use_idf": [True, False]
        }
    ]

    scores = ["precision_micro", "recall_micro", "f1_micro", "accuracy", None]
    for score in scores:
        print("# Tuning hyper-parameters for {0}".format(score))
        print()
        grd = GridSearchCV(linear_svc_pipeline, param_grid=linear_svc_params,
                           cv=k_fold, scoring=score)
        grd.fit(x_train, y_train)

        print("\nBest score and parameter set found on the development set:")
        print("Score:", grd.best_score_, "Params:", grd.best_params_)
        print()
        print("All grid scores on the development set:")
        means = grd.cv_results_["mean_test_score"]
        stds = grd.cv_results_["std_test_score"]
        for mean, std, params in zip(means, stds, grd.cv_results_["params"]):
            print("{0:0.3f} (+/-{1:0.3f}) for {2}".format(mean, std * 2, params))
        print()
        print("Detailed classification report:")
        print("The model is trained on the full development set.")
        print("The scores are computed on the full evaluation set.")
        y_true, y_pred = y_test, grd.predict(x_test)
        print(classification_report(y_true, y_pred,
                                    target_names=["negative", "neutral", "positive"]))
        print()
        print("Confusion matrix:")
        print(confusion_matrix(y_true, y_pred))
        print()
class CommentsAnalyzer(pmlutil.Configurable):
    def configTypes(self):
        return dict(amount=int, min_ngram=int, max_ngram=int, min_df=int,
                    max_df=float, use_idf=int, alpha=readArray,
                    l1_ratio=readArray, n_folds=int)

    def _loadData(self):
        logging.info("loading data")
        self.data = []
        count = 0
        for fn in os.listdir(self._datafolder):
            if not self._amount < 1 and count >= self._amount:
                break
            if fn.endswith(self._metaextension):
                mfn = self._datafolder + "/" + fn
                ddm = pml.Datum(mfn, None)
                if len(ddm.meta()['comments']) > 0:
                    self.data.append(ddm)
                    count += 1
        logging.info("loaded %d data" % count)

    def __init__(self):
        self.data = []

    def _aggregateComments(self, subset):
        allcomments = []
        for datum in subset:
            comments = []
            for comment in datum.meta()['comments']:
                comments.append(comment['text'])
            allcomments.append(" ".join(comments))
        return np.array(allcomments)

    def _buildDictionary(self, allcomments):
        print(allcomments)
        self.vectorizer = TfidfVectorizer(analyzer=self._analyzer,
                                          ngram_range=(self._min_ngram, self._max_ngram),
                                          min_df=self._min_df, max_df=self._max_df,
                                          norm='l2', smooth_idf=True,
                                          use_idf=bool(self._use_idf))
        self.vectorizer.fit(allcomments)

    def run(self):
        allcomments = self._aggregateComments(self.data)
        self._buildDictionary(allcomments)

        # create representation of documents
        tfidfArray = self.vectorizer.transform(allcomments)

        # create labelling
        labels = []
        for datum in self.data:
            labels.append(len(datum.meta()['favorites']))
        labels = np.array(labels)

        print(self.vectorizer.get_params())
        print(self.vectorizer.get_feature_names())

        # training (`rho` was the old scikit-learn alias of l1_ratio and is no
        # longer accepted, so it is not passed here)
        self.elasticNet = ElasticNetCV(alphas=self._alpha, l1_ratio=self._l1_ratio,
                                       fit_intercept=True, normalize=False,
                                       precompute='auto', max_iter=1000,
                                       copy_X=True, tol=0.0001, cv=self._n_folds)
        self.elasticNet.fit(tfidfArray, labels)

        for i, l1_ratio in enumerate(self._l1_ratio):
            for j, alpha in enumerate(self._alpha):
                print("alpha: %f, l1_ratio: %f --> %f"
                      % (alpha, l1_ratio, np.mean(self.elasticNet.mse_path_[i, j, :])))
        print(self.vectorizer.inverse_transform(self.elasticNet.coef_))
class TextRegressor:
    param_defaults = {'min_df': 1,
                      'c_ngmin': 1, 'c_ngmax': 1,
                      'w_ngmin': 1, 'w_ngmax': 1,
                      'lowercase': 'word',
                      'alpha': 1.0, 'C': 1.0, 'mix': 1.0}

    def __init__(self, regressor='ridge', vectorizer='tf-idf'):
        if regressor == 'ridge':
            from sklearn.linear_model import Ridge
            self.reg = Ridge()
        elif regressor == 'SVR':
            from sklearn.svm import SVR
            self.reg = SVR()
        elif regressor == 'linearsvr':
            from sklearn.svm import LinearSVR
            self.reg = LinearSVR()
        if vectorizer == 'tf-idf':
            from sklearn.feature_extraction.text import TfidfVectorizer
            self.vec = TfidfVectorizer()
        self.vec_params_default = self.vec.get_params()
        self.reg_params_default = self.reg.get_params()
        self._reset()

    def _reset(self):
        self.par = dict(self.param_defaults)
        # Copy the defaults so later .update() calls don't mutate them.
        self.vec_params = dict(self.vec_params_default)
        self.vec.set_params(**self.vec_params)
        self.reg_params = dict(self.reg_params_default)
        self.reg.set_params(**self.reg_params)

    def set_params(self, **params):
        self._reset()
        self.par.update(params)
        ngram_analyzer = DocAnalyzer(lowercase=self.par.get('lowercase'),
                                     c_ngmin=self.par.get('c_ngmin'),
                                     c_ngmax=self.par.get('c_ngmax'),
                                     w_ngmin=self.par.get('w_ngmin'),
                                     w_ngmax=self.par.get('w_ngmax'))
        self.vec_params.update(
            {k: self.par[k] for k in self.par.keys() & self.vec_params.keys()})
        self.vec.set_params(**self.vec_params)
        self.vec.set_params(analyzer=ngram_analyzer)
        self.reg_params.update(
            {k: self.par[k] for k in self.par.keys() & self.reg_params.keys()})
        self.reg.set_params(**self.reg_params)

    def get_params(self):
        return self.par

    def fit(self, text, outcome):
        num = None
        if len(text) == 2:
            text, num = text
        x = self.vec.fit_transform(text)
        if num is not None:
            x = hstack((x, self.par['mix'] * num), format='csr')
        self.reg.fit(x, outcome)

    def predict(self, text, gold=None, gold_rank=None, rank_dir=-1,
                return_score=False):
        num = None
        if len(text) == 2:
            text, num = text
        x = self.vec.transform(text)
        if num is not None:
            x = hstack((x, self.par['mix'] * num), format='csr')
        pred = self.reg.predict(x)
        if return_score:
            return pred, self._score(gold, pred, gold_rank, rank_dir)
        else:
            return pred

    def _score(self, gold, pred, gold_rank=None, rank_dir=-1, verbose=False):
        r2 = r2_score(gold, pred)
        rmse = np.sqrt(mean_squared_error(gold, pred))
        if gold_rank is None:
            gold_rank = rankdata(rank_dir * gold, method='ordinal')
        pred_rank = rankdata(rank_dir * pred, method='ordinal')
        corr, _ = pearsonr(gold, pred)
        rank_corr, _ = pearsonr(gold_rank, pred_rank)
        if verbose:
            fmt = ("{}: n={}, min={:.4f}, max={:.4f}, mean={:.4f}, "
                   "var={:.4f}, skew={:.4f}, kurtosis={:.4f}")
            gold_dsc = describe(gold)
            pred_dsc = describe(pred)
            print(fmt.format('gold', gold_dsc[0], *gold_dsc[1], *gold_dsc[2:]))
            print(fmt.format('pred', pred_dsc[0], *pred_dsc[1], *pred_dsc[2:]))
        return {'r2': r2, 'rmse': rmse, 'rank_corr': rank_corr, 'corr': corr}

    def score(self, text, gold, gold_rank=None, rank_dir=-1, verbose=False):
        pred = self.predict(text)
        return self._score(gold, pred, gold_rank, rank_dir, verbose=verbose)
nltk.download('stopwords')

limit = []
for i in range(0, len(stringTotal)):
    review = re.sub('[^a-zA-Z]', ' ', stringTotal['text'][i])
    review = review.lower()
    review = review.split()
    review = [singleStem.stem(word) for word in review
              if word not in stopwords.words('english')]
    review = ' '.join(review)
    limit.append(review)

vector = TfidfVectorizer(max_features=5000, ngram_range=(1, 3))
vector.get_params()
xAxis = vector.fit_transform(limit).toarray()
yAxis = stringTotal['fact']

X_train, X_test, y_train, y_test = train_test_split(xAxis, yAxis,
                                                    test_size=0.2,
                                                    random_state=0)

data_count = pd.DataFrame(X_train, columns=vector.get_feature_names())
data_count.head(3)

identifier = PassiveAggressiveClassifier(max_iter=1000)
def get_data_only_with_entities(self, relevance_threshold=0.75, gamma=0.89,
                                filter=False):
    data = self.mongo.get_all(order_by='id_doc')
    data = [doc for doc in data]
    only_text = [doc['text'] for doc in data]

    ent_dict, ent_set = self.get_dandelion_entities(data)
    if filter:
        entities_set = set([k for k, v in ent_dict.items()])
    else:
        entities_set = ent_set
    entities = {e: i for i, e in enumerate(entities_set)}

    dandelion_entities = np.zeros((len(data), len(entities_set)))
    for doc in data[:]:
        if 'dandelion' in doc:
            for e in doc['dandelion']['annotations']:
                rel = np.float64(e['confidence'])
                name = e['title']
                if rel > relevance_threshold:
                    dandelion_entities[doc['id_doc']][entities[name]] = rel
    entities_sparse = sparse.csr_matrix(dandelion_entities)

    tfidf_vectorizer = TfidfVectorizer(max_df=0.5, max_features=200000,
                                       min_df=2, stop_words='english',
                                       strip_accents='unicode', use_idf=True,
                                       ngram_range=(1, 1), norm='l2',
                                       tokenizer=TextUtils.tokenize_and_stem)
    tfidf_matrix = tfidf_vectorizer.fit_transform(only_text)
    print('tfidf matrix dimension: %s x %s' % (tfidf_matrix.shape[0], tfidf_matrix.shape[1]))
    print('entities matrix dimension: %s x %s' % (entities_sparse.shape[0], entities_sparse.shape[1]))
    print('non zero elements in entities matrix: %s' % len(entities_sparse.data))

    # Balance the two blocks: tf-idf weights stay as-is, entities are scaled by (1 - gamma).
    tfidf_matrix = tfidf_matrix * 1
    entities_sparse = entities_sparse * (1 - gamma)

    f_score_dict = self.labels_dict(data)
    params = tfidf_vectorizer.get_params()
    params['dandelion_entities'] = entities_sparse.shape[1]
    params['original_terms'] = tfidf_matrix.shape[1]  # vocabulary size
    params['gamma'] = gamma
    params['relevance_threshold'] = relevance_threshold
    params['classes'] = len(f_score_dict)
    params['tokenizer'] = 'TextUtils.tokenize_and_stem'
    del params['dtype']
    params['avg_nnz_row'] = (entities_sparse > 0).sum(1).mean()
    return sparse.hstack([tfidf_matrix, entities_sparse]), f_score_dict, params
class CharTfidfTagger(BaseEstimator):
    def __init__(self, **kwargs):
        """
        Character-based tfidf sequence tagger.

        Examples
        --------
        >>> model = CharTfidfTagger()
        >>> model.fit([["token1", "token2"]], [["A", "B"]])
        >>> model.predict([["token1", "token2"]])
        [['A', 'B']]
        """
        self.tagger = sklearn_crfsuite.CRF()
        self.tfidf = TfidfVectorizer(analyzer="char_wb", strip_accents="ascii")
        self.set_params(**kwargs)

    def set_params(self, **params):
        self.tfidf.set_params(**{k.split("__", 1)[-1]: v
                                 for k, v in params.items()
                                 if k.startswith("tfidf__")})
        self.tagger.set_params(**{k.split("__", 1)[-1]: v
                                  for k, v in params.items()
                                  if k.startswith("tagger__")})

    def get_params(self, deep=True):
        params = {"tfidf__" + k: v for k, v in self.tfidf.get_params(deep).items()}
        params.update({"tagger__" + k: v
                       for k, v in self.tagger.get_params(deep).items()})
        return params

    def fit(self, X, y):
        corpus = [" ".join(example) for example in X]
        self.tfidf.fit(corpus)
        features = [self.featurize(example) for example in X]
        self.tagger.fit(features, y)

    def predict(self, X):
        features = [self.featurize(example) for example in X]
        return self.tagger.predict(features)

    def score(self, X, y):
        predictions = self.predict(X)
        return sklearn_crfsuite.metrics.flat_f1_score(y, predictions,
                                                      average="macro")

    def featurize(self, tokens) -> List[Dict[str, Any]]:
        # One dict per token, mapping each char n-gram feature name to its tf-idf weight.
        return [dict(zip(self.tfidf.get_feature_names(),
                         self.tfidf.transform([token]).toarray().reshape(-1)))
                for token in tokens]