Example #1
# Shared imports assumed by the snippets below. SimilarityEncoder comes from
# the dirty_cat library (an older release whose encoder still accepts
# similarity=..., categories="k-means" and n_prototypes).
import numpy as np
import pandas as pd
from pathlib import Path

from dirty_cat import SimilarityEncoder
def preprocess(df, encode, categorize, preran):
    y = df["area"]
    X = df.drop(
        ["area"],
        axis=1,
    )

    X.info()

    if encode:
        # String columns to similarity-encode (left empty in this snippet:
        # fill in the text columns of your dataset before enabling encode).
        encode_columns = []
        n_prototypes = 5
        if not preran:
            enc = SimilarityEncoder(similarity="ngram",
                                    categories="k-means",
                                    n_prototypes=n_prototypes)
            enc.fit(X[encode_columns].values)
            # Make sure the target directory exists before pickling,
            # mirroring similarity_encode below.
            Path("encoders").mkdir(exist_ok=True)
            pd.to_pickle(enc, "encoders/similarity_encoder.pickle")
        else:
            enc = pd.read_pickle("encoders/similarity_encoder.pickle")
        transformed_values = enc.transform(X[encode_columns].values)

        transformed_values = pd.DataFrame(transformed_values, index=X.index)
        # Each encoded column expands into n_prototypes similarity features.
        transformed_values.columns = [
            f"{col}_{i}" for col in encode_columns
            for i in range(n_prototypes)
        ]
        X = pd.concat([X, transformed_values], axis=1)
        X = X.drop(encode_columns, axis=1)

    if categorize:
        obj_cols = X.select_dtypes("object").columns
        X[obj_cols] = X[obj_cols].astype("category")
    return X, y
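A minimal usage sketch (toy data with invented column names; encode=False is
used here because encode_columns above is left empty):

df = pd.DataFrame({
    "month": ["mar", "oct", "aug", "mar"],
    "temp": [8.2, 14.6, 22.1, 9.3],
    "area": [0.0, 1.2, 6.4, 0.0],
})
X, y = preprocess(df, encode=False, categorize=True, preran=False)
# X keeps "month" (now a categorical dtype) and "temp"; y is the "area" target.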
Example #2
class FitSimilarityEncoder:
    """Thin wrapper that fits a SimilarityEncoder on a single string column."""

    def __init__(self, col_name):
        self.col_name = col_name

    def fit(self,
            df,
            similarity="ngram",
            categories="most_frequent",
            n_prototypes=100):
        # Initialize the similarity encoder
        self.similarity_encoder = SimilarityEncoder(similarity=similarity,
                                                    dtype=np.float32,
                                                    categories=categories,
                                                    n_prototypes=n_prototypes,
                                                    random_state=1006)

        # Fit the similarity encoder
        self.similarity_encoder.fit(df[self.col_name].values.reshape(-1, 1))

    def transform(self, df):
        return self.similarity_encoder.transform(
            df[self.col_name].values.reshape(-1, 1))
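A quick sketch of how this wrapper might be used (the DataFrame and column
name are made up for illustration):

df = pd.DataFrame({"city": ["London", "Londn", "Paris", "Paris CEDEX"]})
fse = FitSimilarityEncoder("city")
fse.fit(df, categories="most_frequent", n_prototypes=3)
encoded = fse.transform(df)  # shape (4, 3): one column per prototype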
Example #3
def similarity_encode(X, encode_columns, n_prototypes, train, drop_original):
    X = X.copy()
    if train:
        enc = SimilarityEncoder(similarity="ngram",
                                categories="k-means",
                                n_prototypes=n_prototypes)
        enc.fit(X[encode_columns].values)
        Path("encoders").mkdir(exist_ok=True)
        pd.to_pickle(enc, "encoders/similarity_encoder.pickle")
    else:
        enc = pd.read_pickle("encoders/similarity_encoder.pickle")
    transformed_values = enc.transform(X[encode_columns].values)

    transformed_values = pd.DataFrame(transformed_values, index=X.index)
    # Each encoded column expands into n_prototypes similarity features.
    transformed_values.columns = [
        f"{col}_{i}" for col in encode_columns
        for i in range(n_prototypes)
    ]
    X = pd.concat([X, transformed_values], axis=1)
    if drop_original:
        X = X.drop(encode_columns, axis=1)
    return X
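For illustration, a hypothetical call on a toy frame (column name invented;
train=True fits and pickles the encoder, train=False reloads it):

X = pd.DataFrame({"name": ["Acme Inc", "Acme Incorporated", "Widget Co",
                           "Widgets Company", "Gadget Ltd", "Gadgets Ltd."]})
X_enc = similarity_encode(X, encode_columns=["name"], n_prototypes=2,
                          train=True, drop_original=True)
# X_enc now holds name_0 and name_1: n-gram similarities to the two
# k-means prototypes.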
Example #4
# Additional imports assumed by this snippet: the `distance` package for
# Jaccard/Sorensen string distances, scikit-learn text vectorizers,
# scipy.sparse, and category_encoders as `ce`; TargetEncoder comes from
# dirty_cat. Helpers such as one_hot_encoding, ngram_similarity,
# mdv_encoding and ngrams_hashing_vectorizer are defined elsewhere in the
# source module.
import distance as dist
import category_encoders as ce
from scipy import sparse
from sklearn.feature_extraction.text import (CountVectorizer,
                                             TfidfTransformer,
                                             TfidfVectorizer)
from dirty_cat import SimilarityEncoder, TargetEncoder
def categorical_encoding(A, B, y_train, encoder, clf_type, n_jobs):
    '''Build the encoding matrix.
    Given two arrays of strings to compare and an encoder name, return the
    corresponding encoded matrix of size len(A) x len(B).'''

    if encoder == 'levenshtein-ratio_SimilarityEncoder':
        B = np.unique(B).reshape(-1, 1)
        encoder = SimilarityEncoder(similarity='levenshtein-ratio')
        encoder.fit(B)
        se = encoder.transform(A.reshape(-1, 1))
        return se
    if encoder == 'one-hot_encoding':
        return one_hot_encoding(A, B)
    if encoder == 'one-hot_encoding_sparse':
        return sparse.csr_matrix(one_hot_encoding_sparse(A, B))
    if encoder == 'jaccard_similarity':
        B = np.unique(B)
        warning = (('Warning: %s is not a well-defined similarity ' +
                    'metric because two different values can have a ' +
                    'similarity of 1') % encoder)
        print(warning)
        unqA = np.unique(A)
        vlev = np.vectorize(dist.jaccard)
        # Parallel version kept for reference; the serial fallback below is
        # used instead, so the n_jobs argument currently has no effect.
        # dvec = Parallel(n_jobs=n_jobs)(delayed(vlev)(a, B.reshape(1, -1))
        #                           for a in unqA)
        dvec = [vlev(a, B.reshape(1, -1)) for a in unqA]
        ddict = {unqA[i]: dvec[i] for i in range(len(dvec))}
        dms = (ddict[a] for a in A)
        dm = np.vstack(dms)
        return 1 - dm
    if encoder == 'sorensen_similarity':
        B = np.unique(B)
        unqA = np.unique(A)
        vlev = np.vectorize(dist.sorensen)
        # dvec = Parallel(n_jobs=n_jobs)(delayed(vlev)(a, B.reshape(1, -1))
        #                           for a in unqA)
        dvec = [vlev(a, B.reshape(1, -1)) for a in unqA]
        ddict = {unqA[i]: dvec[i] for i in range(len(dvec))}
        dms = (ddict[a] for a in A)
        dm = np.vstack(dms)
        return 1 - dm
    if encoder == 'jaro-winkler_SimilarityEncoder':
        B = np.unique(B).reshape(-1, 1)
        encoder = SimilarityEncoder(similarity='jaro-winkler')
        encoder.fit(B)
        se = encoder.transform(A.reshape(-1, 1))
        return se
    if encoder[1:] == 'gram_SimilarityEncoder':
        # n is parsed from the encoder name, but the default n-gram range
        # of SimilarityEncoder is used here regardless.
        n = int(encoder[0])
        B = np.unique(B).reshape(-1, 1)
        encoder = SimilarityEncoder()
        encoder.fit(B)
        return encoder.transform(A.reshape(-1, 1))
    if encoder[1:] == 'gram_similarity2':
        n = int(encoder[0])
        B = np.unique(B)
        return ngram_similarity(A, B, n, sim_type='sim2')
    if encoder[1:] == 'gram_presence_fisher_kernel':
        n = int(encoder[0])
        return ngram_similarity(A, B, n, sim_type='fisher_kernel')
    if encoder[1:] == 'gram_similarity2_1':
        n = int(encoder[0])
        B = np.unique(B)
        sm = ngram_similarity(A, B, n, sim_type='sim2_1')
        return sm
    if encoder[1:] == 'gram_similarity2_2':
        n = int(encoder[0])
        B = np.unique(B)
        sm = ngram_similarity(A, B, n, sim_type='sim2_2')
        return sm
    if encoder[1:] == 'gram_similarity3':
        n = int(encoder[0])
        B = np.unique(B)
        sm = ngram_similarity(A, B, n, sim_type='sim3')
        return sm
    if encoder[1:] == 'gram_similarity3_2':
        n = int(encoder[0])
        B = np.unique(B)
        sm = ngram_similarity(A, B, n, sim_type='sim3_2')
        return sm
    if encoder[1:] == 'gram_similarity4':
        n = int(encoder[0])
        B = np.unique(B)
        sm = ngram_similarity(A, B, n, sim_type='sim4')
        return sm
    if encoder[1:] == 'gram_similarity5':
        n = int(encoder[0])
        B = np.unique(B)
        sm = ngram_similarity(A, B, n, sim_type='sim5')
        return sm
    if encoder[1:] == 'gram_similarity6':
        n = int(encoder[0])
        B = np.unique(B)
        sm = ngram_similarity(A, B, n, sim_type='sim6')
        return sm
    if encoder[1:] == 'gram_similarity7':
        n = int(encoder[0])
        B = np.unique(B)
        sm = ngram_similarity(A, B, n, sim_type='sim7')
        return sm
    if encoder[1:] == 'grams_count_vectorizer':
        n = int(encoder[0])
        B = np.unique(B)
        vectorizer = CountVectorizer(analyzer='char', ngram_range=(n, n))
        vectorizer.fit(B)
        return vectorizer.transform(A)
    if encoder[1:] == 'grams_tfidf_vectorizer':
        n = int(encoder[0])
        B = np.unique(B)
        vectorizer = TfidfVectorizer(analyzer='char',
                                     ngram_range=(n, n),
                                     smooth_idf=False)
        vectorizer.fit(B)
        return vectorizer.transform(A)
    if encoder[1:] == 'grams_tf_vectorizer':
        n = int(encoder[0])
        B = np.unique(B)
        vectorizer = TfidfVectorizer(analyzer='char',
                                     ngram_range=(n, n),
                                     smooth_idf=False,
                                     use_idf=False)
        vectorizer.fit(B)
        return vectorizer.transform(A)
    if encoder[1:] == 'grams_hot_vectorizer':
        n = int(encoder[0])
        B = np.unique(B)
        vectorizer = CountVectorizer(analyzer='char', ngram_range=(n, n))
        vectorizer.fit(B)
        count_matrix1 = vectorizer.transform(A)
        return (count_matrix1 > 0).astype('float64')
    if encoder[1:] == 'grams_hot_vectorizer_tfidf':
        n = int(encoder[0])
        B = np.unique(B)
        vectorizer = CountVectorizer(analyzer='char', ngram_range=(n, n))
        presenceB = (vectorizer.fit_transform(B) > 0).astype('float64')
        presenceA = (vectorizer.transform(A) > 0).astype('float64')
        transformer = TfidfTransformer(smooth_idf=True)
        transformer.fit(presenceB)
        tfidfA = transformer.transform(presenceA)
        return tfidfA
    if encoder[1:] == 'grams_hashing':
        n = int(encoder[0])
        hashingA = ngrams_hashing_vectorizer(A, n, 10000)
        return hashingA
    if encoder == 'TargetEncoder':
        encoder = TargetEncoder(clf_type=clf_type, handle_unknown='ignore')
        encoder.fit(B.reshape(-1, 1), y_train)
        return encoder.transform(A.reshape(-1, 1))
    if encoder == 'MDVEncoder':
        return mdv_encoding(A, B, y_train, clf_type)
    if encoder == 'BackwardDifferenceEncoder':
        encoder = ce.BackwardDifferenceEncoder()
        encoder.fit(B)
        return encoder.transform(A)
    if encoder == 'BinaryEncoder':
        encoder = ce.BinaryEncoder()
        encoder.fit(B)
        return encoder.transform(A)
    if encoder == 'HashingEncoder':
        encoder = ce.HashingEncoder()
        encoder.fit(B)
        return encoder.transform(A)
    if encoder == 'HelmertEncoder':
        encoder = ce.HelmertEncoder()
        encoder.fit(B)
        return encoder.transform(A)
    if encoder == 'OneHotEncoder':
        encoder = ce.OneHotEncoder()
        encoder.fit(B)
        return encoder.transform(A)
    if encoder == 'OrdinalEncoder':
        encoder = ce.OrdinalEncoder()
        encoder.fit(B)
        return encoder.transform(A)
    if encoder == 'SumEncoder':
        encoder = ce.SumEncoder()
        encoder.fit(B)
        return encoder.transform(A)
    if encoder == 'PolynomialEncoder':
        encoder = ce.PolynomialEncoder()
        encoder.fit(B)
        return encoder.transform(A)
    if encoder == 'BaseNEncoder':
        encoder = ce.BaseNEncoder()
        encoder.fit(B)
        return encoder.transform(A)
    if encoder == 'LeaveOneOutEncoder':
        encoder = ce.LeaveOneOutEncoder()
        encoder.fit(B, y_train)
        return encoder.transform(A)
    # Every supported branch returns above; anything else is unsupported.
    message = 'Encoder %s has not been implemented yet.' % encoder
    return message
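A minimal sketch of calling the dispatcher (toy arrays; y_train, clf_type
and n_jobs only matter for some branches, and the 'levenshtein-ratio'
similarity assumes an early dirty_cat release that still supports it):

A = np.array(['apple', 'appel', 'banana', 'bananna'])
B = np.array(['apple', 'banana', 'cherry'])
se = categorical_encoding(A, B, y_train=None,
                          encoder='levenshtein-ratio_SimilarityEncoder',
                          clf_type=None, n_jobs=1)
# se has shape (4, 3): similarity of each entry of A to each unique value of B.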