from sklearn.feature_extraction.text import TfidfTransformer


def tf_to_tfidf(docs_tf,
                sublinear_tf=False,
                smooth_idf=False,
                use_idf=True,
                norm='l2',
                idf_diag=None):
    """Transform a term-frequency (TF) matrix into a TF-IDF matrix.

    Return value:
        A tuple containing:
            docsTFIDF - the TF-IDF scores as a sparse CSR matrix
            idf_diag - the fitted IDF diagonal matrix produced by TfidfTransformer

    Uses TfidfTransformer from sklearn.feature_extraction.text. If idf_diag
    is given, it is reused instead of fitting a new one, so previously
    learned IDF weights can be applied to new documents.
    """
    tfidf_transformer = TfidfTransformer(sublinear_tf=sublinear_tf,
                                         smooth_idf=smooth_idf,
                                         use_idf=use_idf,
                                         norm=norm)
    if idf_diag is not None:
        tfidf_transformer._idf_diag = idf_diag
        docsTFIDF = tfidf_transformer.transform(docs_tf)
    else:
        tfidf_transformer._idf_diag = None
        docsTFIDF = tfidf_transformer.fit_transform(docs_tf)
    return (docsTFIDF, tfidf_transformer._idf_diag)
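A minimal usage sketch (variable names are illustrative): fit the IDF on the training TF matrix, then pass the returned idf_diag back in so new documents are weighted with the same IDF. Note this relies on older scikit-learn releases, where TfidfTransformer stores its IDF weights in the private _idf_diag attribute.

# Hypothetical usage: train_tf and test_tf come from the same CountVectorizer
train_tfidf, idf_diag = tf_to_tfidf(train_tf)            # fits a new IDF
test_tfidf, _ = tf_to_tfidf(test_tf, idf_diag=idf_diag)  # reuses the fitted IDF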
Example #2
    # Requires at module level: logging, numpy as np, scipy.sparse as sparse,
    # and sklearn's TfidfTransformer, TruncatedSVD, Normalizer, make_pipeline.
    def compute_vectors(self,
                        count_vectors,
                        min_df,
                        svd=False,
                        n_components=0):
        if min_df > 0:
            mask = self.df > min_df
            df = self.df[mask]
            count_vectors = count_vectors[:, mask]
        else:
            df = self.df
        self.n_samples += count_vectors.shape[0]
        # logging.info("Min_df reduces nb of features, new count matrix shape: {}".format(
        #     count_vectors.shape)
        # )
        # compute smoothed idf
        idf = np.log((self.n_samples + 1) / (df + 1)) + 1
        transformer = TfidfTransformer()
        transformer._idf_diag = sparse.diags(idf,
                                             offsets=0,
                                             shape=(len(df), len(df)),
                                             format="csr",
                                             dtype=df.dtype)
        X = transformer.transform(count_vectors)
        # equivalent to:
        # X = normalize(X * transformer._idf_diag, norm='l2', copy=False)
        if svd:
            logging.info("Performing dimensionality reduction using LSA")
            svd_model = TruncatedSVD(n_components=n_components, random_state=42)
            normalizer = Normalizer(copy=False)
            lsa = make_pipeline(svd_model, normalizer)
            X = lsa.fit_transform(X)
            logging.info("New shape: {}".format(X.shape))

        return X
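The hand-rolled smoothed IDF above is the same formula TfidfTransformer uses with smooth_idf=True (its default). A small self-contained check, as a sketch:

import numpy as np
from scipy import sparse
from sklearn.feature_extraction.text import TfidfTransformer

counts = sparse.csr_matrix([[3, 0, 1],
                            [2, 0, 0],
                            [3, 0, 0],
                            [4, 0, 0]])
n_samples = counts.shape[0]
df = np.asarray((counts > 0).sum(axis=0)).ravel()    # document frequencies
manual_idf = np.log((n_samples + 1) / (df + 1)) + 1  # smoothed idf, as above
fitted = TfidfTransformer(smooth_idf=True).fit(counts)
assert np.allclose(manual_idf, fitted.idf_)          # same weights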
Example #3
def _deserialize_tfidf_vectorizer(vectorizer_dict, language, sublinear_tf):
    tfidf_vectorizer = _get_tfidf_vectorizer(language, sublinear_tf)
    tfidf_transformer = TfidfTransformer()
    vocab = vectorizer_dict["vocab"]
    if vocab is not None:  # If the vectorizer has been fitted
        tfidf_vectorizer.vocabulary_ = vocab
        idf_diag_data = np.array(vectorizer_dict["idf_diag"])
        idf_diag_shape = (len(idf_diag_data), len(idf_diag_data))
        row = list(range(idf_diag_shape[0]))
        col = list(range(idf_diag_shape[0]))
        idf_diag = sp.csr_matrix((idf_diag_data, (row, col)),
                                 shape=idf_diag_shape)
        tfidf_transformer._idf_diag = idf_diag  # pylint: disable=W0212
    tfidf_vectorizer._tfidf = tfidf_transformer  # pylint: disable=W0212
    return tfidf_vectorizer
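The matching serializer is not shown here; presumably it stores the vocabulary together with the diagonal of the fitted IDF matrix. A minimal sketch of such a counterpart (hypothetical helper, reading the same private attributes):

def _serialize_tfidf_vectorizer(tfidf_vectorizer):
    # Inverse of the deserializer above: dump the vocabulary and, if the
    # vectorizer has been fitted, the diagonal of its IDF matrix.
    vocab = getattr(tfidf_vectorizer, "vocabulary_", None)
    idf_diag = None
    if vocab is not None:
        idf_diag = tfidf_vectorizer._tfidf._idf_diag.diagonal().tolist()  # pylint: disable=W0212
    return {"vocab": vocab, "idf_diag": idf_diag}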
Example #4
    @classmethod
    def from_path(cls, path, **shared):
        import numpy as np
        import scipy.sparse as sp
        from sklearn.feature_extraction.text import (TfidfTransformer,
                                                     TfidfVectorizer as
                                                     SklearnTfidfVectorizer)

        path = Path(path)

        model_path = path / "vectorizer.json"
        if not model_path.exists():
            raise LoadingError("Missing vectorizer model file: %s" %
                               model_path.name)
        with model_path.open("r", encoding="utf-8") as f:
            vectorizer_dict = json.load(f)

        vectorizer = cls(vectorizer_dict["config"], **shared)
        vectorizer._language = vectorizer_dict["language_code"]

        builtin_entity_scope = vectorizer_dict["builtin_entity_scope"]
        if builtin_entity_scope is not None:
            builtin_entity_scope = set(builtin_entity_scope)
        vectorizer.builtin_entity_scope = builtin_entity_scope

        vectorizer_ = vectorizer_dict["vectorizer"]
        if vectorizer_:
            vocab = vectorizer_["vocab"]
            idf_diag_data = vectorizer_["idf_diag"]
            idf_diag_data = np.array(idf_diag_data)

            idf_diag_shape = (len(idf_diag_data), len(idf_diag_data))
            row = list(range(idf_diag_shape[0]))
            col = list(range(idf_diag_shape[0]))
            idf_diag = sp.csr_matrix((idf_diag_data, (row, col)),
                                     shape=idf_diag_shape)

            tfidf_transformer = TfidfTransformer()
            tfidf_transformer._idf_diag = idf_diag

            vectorizer_ = SklearnTfidfVectorizer(
                tokenizer=lambda x: tokenize_light(x, vectorizer._language))
            vectorizer_.vocabulary_ = vocab

            vectorizer_._tfidf = tfidf_transformer

        vectorizer._tfidf_vectorizer = vectorizer_
        return vectorizer
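For reference, the vectorizer.json this loader expects has roughly the following shape (keys taken from the code above; the values shown are invented for illustration):

# Illustrative contents of vectorizer.json, expressed as a Python dict
vectorizer_dict = {
    "config": {},                      # passed to cls(...) as the config
    "language_code": "en",
    "builtin_entity_scope": None,      # or a list of builtin entity names
    "vectorizer": {                    # None/empty if never fitted
        "vocab": {"hello": 0, "world": 1},
        "idf_diag": [1.69, 1.28],      # diagonal of the fitted IDF matrix
    },
}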
Example #5
import math
import numpy as np
from queue import PriorityQueue
from time import strftime

from sklearn.feature_extraction.text import TfidfTransformer


def reduce_feature_space(normed_TFIDF_features, a_value, rating_list, fitted_transformer, vocabulary):
    """
    Reduce the number of features that will be used to train the model.
    Input arguments:
        normed_TFIDF_features: a sparse csr matrix containing the normalized TF-IDF features extracted from the reviews
        a_value: a hyperparameter to tune, see report for more information
        rating_list: a list of ratings corresponding to the reviews (rows of the feature matrix)
        fitted_transformer: a TfidfTransformer object, to extract IDF values from
        vocabulary: a dictionary mapping the used words to their feature index
    Output arguments:
        reduced_normed_TFIDF_features: a sparse csr matrix containing the reduced normalized TF-IDF features
        reduced_vocabulary: a dictionary containing the reduced set of words and their feature index
        new_fitted_transformer: a TfidfTransformer object from which the dropped IDF values have been removed
    """

    # Initialize one sigma priority queue per rating class (0-5)
    sigma_values = {rating: PriorityQueue() for rating in range(6)}

    # Set the reduction level. This is a hyperparameter, but it is not tuned
    # in the first phase; a value of 10% is taken from the literature.
    reduction_level = 0.10

    # The number of features is the number of columns in the feature matrix
    all_features = normed_TFIDF_features.shape[1]

    # Keep "reduction_level*100"% of all features
    keep_features = int(math.ceil(float(all_features)*reduction_level))

    # Number of features that may be chosen per rating
    features_per_rating = int(math.ceil(float(keep_features)/6))

    # Get IDF values from the transformer
    IDFs = fitted_transformer._idf_diag.diagonal()

    # Use a counter to visualize the progress
    count = 1.0
    prev = 0.0
    epsilon = 0.01

    # Transform matrix to csc to perform more effectively with column operations
    normed_TFIDF_features = normed_TFIDF_features.tocsc()

    # Loop over all features
    rating_list = np.array(rating_list)
    for i in range(0, all_features):
        feature = np.ndarray.flatten(normed_TFIDF_features.getcol(i).toarray())

        # Compute sigma value as described in literature
        sigma_set = rating_list[feature == 0]
        mean_rating = int(round(sigma_set.mean()))

        # Do not invert this value, as is done in literature; this makes the priority queue ordering easier
        sigma_value = (sigma_set.var() + epsilon) * ((IDFs[i]) ** a_value)

        # Store (sigma, feature index) at the rating class with the nearest mean
        sigma_values[mean_rating].put((sigma_value, i))

        # Print a "counter" to visualize progress
        if count / all_features > prev:
            print(strftime("%H:%M:%S") + ' : ' + str(math.ceil(100 * (count / all_features * 100)) / 100) + '% of the features checked for sigma values.')
            prev += 0.01
        count += 1.0

    # Selection of the most informative features
    features_to_keep = []
    
    # Transform back into csr format (to delete feature cols)
    normed_TFIDF_features = normed_TFIDF_features.tocsr()

    # Let every rating class choose a feature in a round-robin fashion
    for a in range(0, 6 * features_per_rating):
        if not sigma_values[a % 6].empty():
            features_to_keep.append(sigma_values[a % 6].get()[1])  # (sigma, index) -> keep the index

    # Remove all features that will not be used
    remove = list_diff(range(0, all_features), features_to_keep)
    reduced_normed_TFIDF_features = drop_cols(normed_TFIDF_features, remove)

    # Remove all words from the vocabulary that won't be used and adjust the indices
    reduced_vocabulary_temp = {k: v for (k, v) in vocabulary.items() if v in features_to_keep}
    sorted_voc = sorted(reduced_vocabulary_temp.items(), key=lambda x: x[1])
    reduced_vocabulary = {}
    for z in range(0, len(reduced_vocabulary_temp)):
        reduced_vocabulary[sorted_voc[z][0]] = z

    new_fitted_transformer = TfidfTransformer(smooth_idf=False)
    new_fitted_transformer._idf_diag = fitted_transformer._idf_diag.copy()
    
    # Remove the dropped features' IDF values from the IDF matrix of the fitted transformer
    new_fitted_transformer._idf_diag = drop_cols(new_fitted_transformer._idf_diag, remove)
    new_fitted_transformer._idf_diag = drop_rows(new_fitted_transformer._idf_diag, remove)

    return reduced_normed_TFIDF_features, reduced_vocabulary, new_fitted_transformer
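The helpers list_diff, drop_cols and drop_rows used above are not shown; the original project may implement them differently, but a minimal sketch for scipy sparse matrices could be:

def list_diff(a, b):
    # Elements of a that are not in b, preserving order
    b = set(b)
    return [x for x in a if x not in b]

def drop_cols(matrix, cols):
    # Return a copy of a sparse matrix without the given column indices
    cols = set(cols)
    keep = [i for i in range(matrix.shape[1]) if i not in cols]
    return matrix[:, keep]

def drop_rows(matrix, rows):
    # Return a copy of a sparse matrix without the given row indices
    rows = set(rows)
    keep = [i for i in range(matrix.shape[0]) if i not in rows]
    return matrix[keep, :]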
Example #6
import json
import os

import joblib

inputData = inputEvent['vector']

### Model input

modelDirBase = os.getenv('DATA_DIR', '/model')
modelId = os.getenv('MODEL_ID')
modelDir = modelDirBase + "/" + modelId
model = joblib.load(modelDir + '/model.pkl')

docs_new = [inputData]

newDocsNormalized = docs_new
if isTextData:
    from sklearn.feature_extraction.text import CountVectorizer
    countVectorizer = CountVectorizer()
    countVectorizer.vocabulary_ = joblib.load(modelDir + '/vocabulary.pkl')

    from sklearn.feature_extraction.text import TfidfTransformer
    tfidf_transformer = TfidfTransformer()
    tfidf_transformer._idf_diag = joblib.load(modelDir + '/tfidf.pkl')

    X_new_counts = countVectorizer.transform(docs_new)
    newDocsNormalized = tfidf_transformer.transform(X_new_counts)

predicted = model.predict(newDocsNormalized)
print(docs_new)
response = {}
response['response'] = predicted[0]
with open(baseDataDir + '/response.json', 'w') as f:
    f.write(json.dumps(response))

print('KPIPES:SUCCESS')
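The training-side counterpart that produced model.pkl, vocabulary.pkl and tfidf.pkl is not shown. A minimal sketch, assuming the same file layout and the same private _idf_diag attribute (the classifier choice and the training data names are illustrative):

# Hypothetical training script that persists the three artifacts loaded above
import joblib
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression

count_vectorizer = CountVectorizer()
X_counts = count_vectorizer.fit_transform(train_docs)    # train_docs: list of str (illustrative)
tfidf_transformer = TfidfTransformer().fit(X_counts)
X_train = tfidf_transformer.transform(X_counts)
model = LogisticRegression().fit(X_train, train_labels)  # train_labels: illustrative

joblib.dump(model, modelDir + '/model.pkl')
joblib.dump(count_vectorizer.vocabulary_, modelDir + '/vocabulary.pkl')
joblib.dump(tfidf_transformer._idf_diag, modelDir + '/tfidf.pkl')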