from sklearn.feature_extraction.text import TfidfTransformer


def tf_to_tfidf(docs_tf, sublinear_tf=False, smooth_idf=False, use_idf=True,
                norm='l2', idf_diag=None):
    """Transform a TF matrix into a TF-IDF matrix.

    Return value: a tuple which contains:
        docsTFIDF - the TF-IDF scores as a scipy sparse matrix
        idf_diag - the IDF information produced by TfidfTransformer

    Uses TfidfTransformer from sklearn.feature_extraction.
    """
    tfidf_transformer = TfidfTransformer(sublinear_tf=sublinear_tf,
                                         smooth_idf=smooth_idf,
                                         use_idf=use_idf, norm=norm)
    if idf_diag is not None:
        # Reuse previously fitted IDF weights instead of refitting
        tfidf_transformer._idf_diag = idf_diag
        docsTFIDF = tfidf_transformer.transform(docs_tf)
    else:
        tfidf_transformer._idf_diag = None
        docsTFIDF = tfidf_transformer.fit_transform(docs_tf)
    return (docsTFIDF, tfidf_transformer._idf_diag)
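# Usage sketch for tf_to_tfidf (illustrative, not part of the original code);
# it assumes an older scikit-learn where TfidfTransformer keeps its IDF
# weights in the private _idf_diag attribute. Fit once on the training TF
# matrix, then pass the returned idf_diag back in so new documents are
# weighted with the training-time IDF values.
from sklearn.feature_extraction.text import CountVectorizer

count_vectorizer = CountVectorizer()
train_tf = count_vectorizer.fit_transform(["the cat sat", "the dog ran"])
train_tfidf, idf_diag = tf_to_tfidf(train_tf)

new_tf = count_vectorizer.transform(["the cat ran"])
new_tfidf, _ = tf_to_tfidf(new_tf, idf_diag=idf_diag)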
def compute_vectors(self, count_vectors, min_df, svd=False, n_components=0):
    if min_df > 0:
        mask = self.df > min_df
        df = self.df[mask]
        count_vectors = count_vectors[:, mask]
    else:
        df = self.df
    self.n_samples += count_vectors.shape[0]
    # logging.info("Min_df reduces nb of features, new count matrix shape: {}".format(
    #     count_vectors.shape)
    # )
    # compute smoothed idf
    idf = np.log((self.n_samples + 1) / (df + 1)) + 1
    transformer = TfidfTransformer()
    transformer._idf_diag = sparse.diags(idf, offsets=0,
                                         shape=(len(df), len(df)),
                                         format="csr", dtype=df.dtype)
    X = transformer.transform(count_vectors)
    # equivalent to:
    # X = normalize(X * transformer._idf_diag, norm='l2', copy=False)
    if svd:
        logging.info("Performing dimensionality reduction using LSA")
        svd = TruncatedSVD(n_components=n_components, random_state=42)
        normalizer = Normalizer(copy=False)
        lsa = make_pipeline(svd, normalizer)
        X = lsa.fit_transform(X)
        logging.info("New shape: {}".format(X.shape))
    return X
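# Sketch of the manual path mentioned in the comment above (assumes
# count_vectors is a scipy sparse matrix and idf is the smoothed IDF vector
# computed in compute_vectors): right-multiplying by the diagonal IDF matrix
# scales each column by its IDF weight, and row-wise L2 normalization
# reproduces TfidfTransformer's default norm.
from scipy import sparse
from sklearn.preprocessing import normalize

idf_diag = sparse.diags(idf, offsets=0, format="csr")
X_manual = normalize(count_vectors * idf_diag, norm="l2", copy=False)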
import numpy as np
import scipy.sparse as sp
from sklearn.feature_extraction.text import TfidfTransformer


def _deserialize_tfidf_vectorizer(vectorizer_dict, language, sublinear_tf):
    # _get_tfidf_vectorizer is defined elsewhere in the same module
    tfidf_vectorizer = _get_tfidf_vectorizer(language, sublinear_tf)
    tfidf_transformer = TfidfTransformer()
    vocab = vectorizer_dict["vocab"]
    if vocab is not None:  # If the vectorizer has been fitted
        tfidf_vectorizer.vocabulary_ = vocab
        idf_diag_data = np.array(vectorizer_dict["idf_diag"])
        idf_diag_shape = (len(idf_diag_data), len(idf_diag_data))
        row = list(range(idf_diag_shape[0]))
        col = list(range(idf_diag_shape[0]))
        # Rebuild the square diagonal IDF matrix from its serialized diagonal
        idf_diag = sp.csr_matrix((idf_diag_data, (row, col)),
                                 shape=idf_diag_shape)
        tfidf_transformer._idf_diag = idf_diag  # pylint: disable=W0212
    tfidf_vectorizer._tfidf = tfidf_transformer  # pylint: disable=W0212
    return tfidf_vectorizer
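# Serialization counterpart implied by _deserialize_tfidf_vectorizer above.
# This is a sketch, not the original project's code: the dict only needs the
# fitted vocabulary and the diagonal of the IDF matrix, stored as a plain list.
def _serialize_tfidf_vectorizer(tfidf_vectorizer):
    vectorizer_dict = {"vocab": None, "idf_diag": None}
    if hasattr(tfidf_vectorizer, "vocabulary_"):  # fitted
        vectorizer_dict["vocab"] = tfidf_vectorizer.vocabulary_
        # _idf_diag is a square CSR matrix; its diagonal is enough to rebuild it
        idf_diag = tfidf_vectorizer._tfidf._idf_diag  # pylint: disable=W0212
        vectorizer_dict["idf_diag"] = idf_diag.diagonal().tolist()
    return vectorizer_dict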
@classmethod
def from_path(cls, path, **shared):
    import numpy as np
    import scipy.sparse as sp
    from sklearn.feature_extraction.text import (
        TfidfTransformer, TfidfVectorizer as SklearnTfidfVectorizer)

    # Path, json, LoadingError and tokenize_light come from the surrounding module
    path = Path(path)
    model_path = path / "vectorizer.json"
    if not model_path.exists():
        raise LoadingError("Missing vectorizer model file: %s"
                           % model_path.name)
    with model_path.open("r", encoding="utf-8") as f:
        vectorizer_dict = json.load(f)

    vectorizer = cls(vectorizer_dict["config"], **shared)
    vectorizer._language = vectorizer_dict["language_code"]

    builtin_entity_scope = vectorizer_dict["builtin_entity_scope"]
    if builtin_entity_scope is not None:
        builtin_entity_scope = set(builtin_entity_scope)
    vectorizer.builtin_entity_scope = builtin_entity_scope

    vectorizer_ = vectorizer_dict["vectorizer"]
    if vectorizer_:
        vocab = vectorizer_["vocab"]
        idf_diag_data = np.array(vectorizer_["idf_diag"])
        idf_diag_shape = (len(idf_diag_data), len(idf_diag_data))
        row = list(range(idf_diag_shape[0]))
        col = list(range(idf_diag_shape[0]))
        # Rebuild the diagonal IDF matrix from its serialized diagonal
        idf_diag = sp.csr_matrix((idf_diag_data, (row, col)),
                                 shape=idf_diag_shape)
        tfidf_transformer = TfidfTransformer()
        tfidf_transformer._idf_diag = idf_diag
        vectorizer_ = SklearnTfidfVectorizer(
            tokenizer=lambda x: tokenize_light(x, vectorizer._language))
        vectorizer_.vocabulary_ = vocab
        vectorizer_._tfidf = tfidf_transformer

    vectorizer._tfidf_vectorizer = vectorizer_
    return vectorizer
import math
from queue import PriorityQueue
from time import strftime

import numpy as np
from sklearn.feature_extraction.text import TfidfTransformer


def reduce_feature_space(normed_TFIDF_features, a_value, rating_list,
                         fitted_transformer, vocabulary):
    """Reduce the number of features that will be used to train the model.

    input arguments:
        normed_TFIDF_features: a sparse csr matrix containing the normalized
            TF-IDF features extracted from the reviews
        a_value: a hyperparameter to tune, see report for more information
        rating_list: a list of ratings corresponding to the reviews
            (rows of the feature matrix)
        fitted_transformer: a TfidfTransformer object, to extract IDF values from
        vocabulary: a dictionary containing the used words and their feature index

    output arguments:
        reduced_normed_TFIDF_features: a sparse csr matrix containing the
            reduction of the normalized TF-IDF features
        reduced_vocabulary: a dictionary containing the reduced set of words
            and their feature index
        new_fitted_transformer: a TfidfTransformer object, from which some
            IDF values have been removed
    """
    # Initialize the sigma priority queues, one per rating class (0-5)
    sigma_values = {rating: PriorityQueue() for rating in range(6)}
    # Set the reduction level; this is a hyperparameter, but it is not tuned
    # in the first phase. A value of 10% is taken from the literature.
    reduction_level = 0.10
    # The number of features is the number of columns in the feature matrix
    all_features = normed_TFIDF_features.shape[1]
    # Keep "reduction_level * 100"% of all features
    keep_features = int(math.ceil(float(all_features) * reduction_level))
    # Number of features that may be chosen per rating
    features_per_rating = int(math.ceil(float(keep_features) / 6))
    # Get IDF values from the transformer
    IDFs = fitted_transformer._idf_diag.diagonal()
    # Use a counter to visualize the progress
    count = 1.0
    prev = 0.0
    epsilon = 0.01
    rating_list = np.array(rating_list)
    # Transform the matrix to csc to make column operations more efficient
    normed_TFIDF_features = normed_TFIDF_features.tocsc()
    # Loop over all features
    for i in range(all_features):
        feature = normed_TFIDF_features.getcol(i).toarray().ravel()
        # Compute the sigma value as described in the literature
        sigma_set = rating_list[feature == 0]
        mean_rating = int(round(sigma_set.mean()))
        # Do not invert this value, as is done in the literature; this makes
        # using the min-heap based PriorityQueue easier
        sigma_value = (sigma_set.var() + epsilon) * ((IDFs[i]) ** a_value)
        # Store the feature index in the queue of the nearest mean rating;
        # the queue orders its entries by the first tuple element (sigma value)
        sigma_values[mean_rating].put((sigma_value, i))
        # Print a "counter" to visualize progress
        if count / all_features > prev:
            print(strftime("%H:%M:%S") + ' : '
                  + str(math.ceil(100 * (count / all_features * 100)) / 100)
                  + '% of the features checked for sigma values.')
            prev += 0.01
        count += 1.0
    # Selection of the most informative features
    features_to_keep = []
    # Transform back into csr format (to delete feature cols)
    normed_TFIDF_features = normed_TFIDF_features.tocsr()
    # Let every rating class choose a feature in round-robin fashion
    for a in range(6 * features_per_rating):
        if not sigma_values[a % 6].empty():
            # get() returns (sigma_value, feature_index); keep the index
            features_to_keep.append(sigma_values[a % 6].get()[1])
    # Remove all features that will not be used
    # (list_diff, drop_cols and drop_rows are helper functions defined elsewhere)
    remove = list_diff(range(all_features), features_to_keep)
    reduced_normed_TFIDF_features = drop_cols(normed_TFIDF_features, remove)
    # Remove all words from the vocabulary that won't be used, adapt indices
    reduced_vocabulary_temp = {k: v for (k, v) in vocabulary.items()
                               if v in features_to_keep}
    sorted_voc = sorted(reduced_vocabulary_temp.items(), key=lambda x: x[1])
    reduced_vocabulary = {}
    for z in range(len(reduced_vocabulary_temp)):
        reduced_vocabulary[sorted_voc[z][0]] = z
    new_fitted_transformer = TfidfTransformer(smooth_idf=False)
    new_fitted_transformer._idf_diag = fitted_transformer._idf_diag.copy()
    # Remove unnecessary IDF values from the IDF matrix of the fitted transformer
    new_fitted_transformer._idf_diag = drop_cols(new_fitted_transformer._idf_diag, remove)
    new_fitted_transformer._idf_diag = drop_rows(new_fitted_transformer._idf_diag, remove)
    return reduced_normed_TFIDF_features, reduced_vocabulary, new_fitted_transformer
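# Minimal sketches of the helpers assumed by reduce_feature_space above;
# list_diff, drop_cols and drop_rows are defined elsewhere in the original
# project, so these are illustrative implementations, not the actual code.
import numpy as np


def list_diff(all_items, to_keep):
    # Items of all_items that are not in to_keep, order preserved
    keep_set = set(to_keep)
    return [i for i in all_items if i not in keep_set]


def drop_cols(matrix, col_indices):
    # Return a copy of a sparse matrix without the given columns
    keep = np.setdiff1d(np.arange(matrix.shape[1]),
                        np.asarray(col_indices, dtype=int))
    return matrix.tocsc()[:, keep].tocsr()


def drop_rows(matrix, row_indices):
    # Return a copy of a sparse matrix without the given rows
    keep = np.setdiff1d(np.arange(matrix.shape[0]),
                        np.asarray(row_indices, dtype=int))
    return matrix.tocsr()[keep, :]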
import json
import os

import joblib

# inputEvent, isTextData and baseDataDir are assumed to be defined earlier
# in the surrounding script
inputData = inputEvent['vector']

### Model input
modelDirBase = os.getenv('DATA_DIR', '/model')
modelId = os.getenv('MODEL_ID')
modelDir = modelDirBase + "/" + modelId

model = joblib.load(modelDir + '/model.pkl')
docs_new = [inputData]
newDocsNormalized = docs_new

if isTextData:
    from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

    # Rebuild the vectorizer and transformer from their pickled state
    countVectorizer = CountVectorizer()
    countVectorizer.vocabulary_ = joblib.load(modelDir + '/vocabulary.pkl')
    tfidf_transformer = TfidfTransformer()
    tfidf_transformer._idf_diag = joblib.load(modelDir + '/tfidf.pkl')

    X_new_counts = countVectorizer.transform(docs_new)
    newDocsNormalized = tfidf_transformer.transform(X_new_counts)

predicted = model.predict(newDocsNormalized)
print(docs_new)

response = {}
response['response'] = predicted[0]
open(baseDataDir + '/response.json', 'w').write(json.dumps(response))
print('KPIPES:SUCCESS')
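# Training-side sketch (assumed; not shown in the original snippet) of how the
# three artifacts loaded above could be produced. train_docs, train_labels and
# the SGDClassifier choice are placeholders, not from the original code.
import joblib
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import SGDClassifier

count_vectorizer = CountVectorizer()
X_counts = count_vectorizer.fit_transform(train_docs)  # train_docs: list of str
tfidf_transformer = TfidfTransformer()
X_tfidf = tfidf_transformer.fit_transform(X_counts)
model = SGDClassifier().fit(X_tfidf, train_labels)     # train_labels: class labels

joblib.dump(model, modelDir + '/model.pkl')
joblib.dump(count_vectorizer.vocabulary_, modelDir + '/vocabulary.pkl')
joblib.dump(tfidf_transformer._idf_diag, modelDir + '/tfidf.pkl')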