Ejemplo n.º 1
0
    def test_get_features_name(self):
        """Check the feature names exposed by a fitted ``Tfidf``.

        Fits the model on two short documents and compares
        ``get_features_name`` with the four expected terms.
        """
        corpus = ['where you from', 'where are you']
        expected_names = ['are', 'from', 'where', 'you']

        model = Tfidf()
        model.fit_model(documents=corpus)

        # get_features_name is read as an attribute, not called.
        assert model.get_features_name == expected_names
Ejemplo n.º 2
0
    def create_feature(self, data, feature):
        """Build a feature table from one column of ``data``.

        A sidebar select box chooses the featurizer. "Embedding" passes the
        column values to ``self.embedding.predict_one`` and converts every
        component to ``float``. "TfIdf" fits a fresh ``Tfidf`` on the column
        using the n-gram options picked in the sidebar and labels the
        resulting columns with the fitted feature names.

        Args:
            data: Object read as ``data[feature].values`` (presumably a
                pandas DataFrame -- confirm against the caller).
            feature: Key of the column in ``data`` to featurize.

        Returns:
            A ``pd.DataFrame`` holding the generated features, or ``None``
            when nothing was generated (for example "TfIdf" is chosen but no
            n-gram option has been selected yet).
        """
        method = st.sidebar.selectbox("How to generate features?",
                                      ['Embedding', 'TfIdf'])

        if method == 'Embedding':
            vectors = self.embedding.predict_one(x=[data[feature].values])
            rows = [[float(component) for component in vector]
                    for vector in vectors]
            return pd.DataFrame(rows)

        if method != "TfIdf":
            return None

        selected = st.sidebar.multiselect("Use bigrams? unigrams? or both?",
                                          ['unigrams', 'bigrams'])
        if not selected:
            st.sidebar.warning("Please select the ngrams.")
            return None

        with_bigrams = "bigrams" in selected
        # Unigrams are switched off only when bigrams alone were requested.
        with_unigrams = "unigrams" in selected or not with_bigrams
        vectorizer = Tfidf(bigrams=with_bigrams, unigrams=with_unigrams)

        column_values = data[feature].values
        vectorizer.fit_model(documents=column_values)
        features = pd.DataFrame(vectorizer.transform(document=column_values))
        features.columns = vectorizer.get_features_name
        return features
Ejemplo n.º 3
0
    def test_transform(self):
        """Check the weights returned by ``Tfidf.transform``.

        The model is fitted and applied to the same two documents; actual
        and expected values are both rounded to three decimals before being
        compared for equality.
        """
        def rounded(rows):
            # Round every cell to 3 decimals and return plain nested lists.
            return [[round(value, 3) for value in row] for row in rows]

        corpus = ['where you from', 'where are you']
        model = Tfidf()
        model.fit_model(documents=corpus)
        weights = model.transform(document=corpus)

        expected = np.array([[0., 0.70490949, 0.50154891, 0.50154891],
                             [0.70490949, 0., 0.50154891, 0.50154891]])

        assert rounded(weights) == rounded(expected)
Ejemplo n.º 4
0
 def main(self):
     """Render the text box and show the features computed for its content.

     Nothing is displayed while the text area is empty. When
     ``self.model_name`` is "MultilangEmbedding" the first vector returned by
     ``self.embedding.predict_one`` is shown as a one-row table. Otherwise a
     new ``Tfidf`` (bigrams toggled by a checkbox) is fitted on the text and
     its output is shown with the fitted feature names as column labels.
     """
     text = st.text_area("Enter the text to normalize here:")
     if not text:
         return

     if self.model_name == "MultilangEmbedding":
         prediction = self.embedding.predict_one(x=text)
         row = [float(value) for value in prediction[0]]
         st.write(pd.DataFrame([row]))
         return

     # Any other model name falls through to the TF-IDF path.
     use_bigrams = st.checkbox('Use bigrams?')
     self.tfidf = Tfidf(bigrams=use_bigrams)
     self.tfidf.fit_model(documents=[text])
     weights = self.tfidf.transform(document=[text])
     names = self.tfidf.get_features_name
     frame = pd.DataFrame(weights)
     frame.columns = names
     st.write(frame)
Ejemplo n.º 5
0
    def test_end_to_end(self):
        """Fit, transform and tabulate, then compare with known values.

        Checks three things for the same two documents: the rounded weight
        rows, the feature names, and the DataFrame built from both.
        """
        def rounded(rows):
            # Round every cell to 3 decimals and return plain nested lists.
            return [[round(value, 3) for value in row] for row in rows]

        corpus = ['where you from', 'where are you']
        model = Tfidf()
        model.fit_model(documents=corpus)
        weights = model.transform(document=corpus)
        names = model.get_features_name

        actual_rows = rounded(weights)
        actual_frame = pd.DataFrame(actual_rows)
        actual_frame.columns = names

        expected_names = ['are', 'from', 'where', 'you']
        expected_rows = rounded(
            np.array([[0., 0.70490949, 0.50154891, 0.50154891],
                      [0.70490949, 0., 0.50154891, 0.50154891]]))
        expected_frame = pd.DataFrame(expected_rows, columns=expected_names)

        assert actual_rows == expected_rows
        assert names == expected_names
        assert actual_frame.equals(expected_frame)
Ejemplo n.º 6
0
 def __init__(self, model_name) -> None:
     """Store the model name and create the two featurizer objects.

     Args:
         model_name: Value stored unchanged on ``self.model_name``
             (presumably a model identifier string -- confirm against
             callers).
     """
     self.model_name = model_name
     # Both helpers are built with their default arguments, whatever
     # model_name is.
     self.tfidf = Tfidf()
     self.embedding = MUSE()