def test_transform(self):
        """Vectorizing deerwester yields a Corpus with 43 domain variables.

        The count is taken from ``domain.variables`` rather than
        ``len(domain)`` so the assertion states explicitly which part of the
        domain is being measured.
        NOTE(review): ``len(domain)`` may or may not include metas depending
        on the Orange release - confirm against the pinned version.
        """
        vect = BowVectorizer()
        corpus = Corpus.from_file('deerwester')

        result = vect.transform(corpus)
        self.assertIsInstance(result, Corpus)
        self.assertEqual(len(result.domain.variables), 43)
    def test_transform(self):
        """The transformed deerwester corpus is a Corpus with 43 variables."""
        bow = BowVectorizer().transform(Corpus.from_file('deerwester'))

        self.assertIsInstance(bow, Corpus)
        variable_count = len(bow.domain.variables)
        self.assertEqual(variable_count, 43)
    def test_compute_values(self):
        """Re-applying the BoW domain to the source corpus reproduces the BoW."""
        source = Corpus.from_file('deerwester')
        bow = BowVectorizer().transform(source)

        # Building a table from the BoW domain goes through ``from_table``
        # rather than through the vectorizer itself.
        recomputed = Corpus.from_table(bow.domain, source)

        self.assertEqual(bow.domain, recomputed.domain)
        differing_cells = (bow.X != recomputed.X).nnz
        self.assertEqual(differing_cells, 0)
 def test_empty_corpus(self):
     """
     Vectorizing a corpus without documents returns an equal corpus.
     GH-247
     """
     empty = Corpus.from_file("deerwester")[:0]
     transformed = BowVectorizer(norm=BowVectorizer.L1).transform(empty)
     self.assertEqual(transformed, empty)
    def test_compute_values(self):
        """A table built from the BoW domain equals the direct transform."""
        original = Corpus.from_file('deerwester')
        vectorized = BowVectorizer().transform(original)
        rebuilt = Corpus.from_table(vectorized.domain, original)

        self.assertEqual(vectorized.domain, rebuilt.domain)
        # ``!=`` gives an element-wise mask; no stored entries means equality.
        mismatch_mask = vectorized.X != rebuilt.X
        self.assertEqual(mismatch_mask.nnz, 0)
    def test_callback(self):
        """transform() reports progress through the supplied callback.

        The recorded calls must contain 0.3, 0.6, 0.9 and 1 in sequence.
        """
        progress = MagicMock()
        corpus = Corpus.from_file("deerwester")

        result = BowVectorizer().transform(corpus, callback=progress)

        self.assertIsInstance(result, Corpus)
        self.assertEqual(len(result.domain.variables), 43)
        expected_calls = [call(step) for step in (0.3, 0.6, 0.9, 1)]
        progress.assert_has_calls(expected_calls)
 def test_empty_corpus(self):
     """
     An empty slice of the corpus passes through the vectorizer unchanged
     (the output compares equal to the input).
     GH-247
     """
     no_documents = Corpus.from_file("deerwester")[:0]
     l1_vectorizer = BowVectorizer(norm=BowVectorizer.L1)
     self.assertEqual(l1_vectorizer.transform(no_documents), no_documents)
 def test_ngrams(self):
     """N-grams (n = 1..3) from the first document become BoW attributes.

     Checks the second token, and the space-joined first two and first
     three tokens, against the resulting attribute names.
     """
     vect = BowVectorizer()
     corpus = Corpus.from_file('deerwester')
     # Raw string: "\w" is not a valid str escape and newer Python versions
     # warn about it in a plain literal.
     corpus = preprocess.RegexpTokenizer(r'\w+')(corpus)
     corpus = preprocess.NGrams(ngrams_range=(1, 3))(corpus)
     result = vect.transform(corpus)
     attrs = [attr.name for attr in result.domain.attributes]
     self.assertIn(corpus.tokens[0][1], attrs)
     self.assertIn(' '.join(corpus.tokens[0][:2]), attrs)
     self.assertIn(' '.join(corpus.tokens[0][:3]), attrs)
 def test_ngrams(self):
     """N-grams (n = 1..3) from the first document become BoW attributes.

     The corpus is preprocessed in place with a regexp tokenizer and an
     n-gram range of (1, 3) before it is vectorized.
     """
     vect = BowVectorizer()
     corpus = Corpus.from_file('deerwester')
     # Raw string: "\w" is not a valid str escape and newer Python versions
     # warn about it in a plain literal.
     pr = preprocess.Preprocessor(tokenizer=preprocess.RegexpTokenizer(r'\w+'),
                                  ngrams_range=(1, 3))
     pr(corpus, inplace=True)
     result = vect.transform(corpus)
     attrs = [attr.name for attr in result.domain.attributes]
     self.assertIn(corpus.tokens[0][1], attrs)
     self.assertIn(' '.join(corpus.tokens[0][:2]), attrs)
     self.assertIn(' '.join(corpus.tokens[0][:3]), attrs)
Exemple #10
0
def main():
    """Show OWWordEnrichment fed with the book-excerpts bag of words.

    The whole vectorized corpus is set as data and its first ten rows as the
    selected data before the Qt event loop is started.
    """
    bow_corpus = BowVectorizer().transform(Corpus.from_file('book-excerpts'))

    app = QApplication([])
    widget = OWWordEnrichment()
    widget.set_data(bow_corpus)
    widget.set_data_selected(bow_corpus[:10])
    widget.handleNewSignals()
    widget.show()
    app.exec()
    def test_domain(self):
        """BoW attributes are sorted and non-zero cells match document tokens."""
        vect = BowVectorizer()
        corpus = Corpus.from_file('deerwester')

        result = vect.transform(corpus)
        names = [variable.name for variable in result.domain.attributes]
        self.assertEqual(names, sorted(names))

        dense = result.X.toarray()
        for doc_index in range(len(corpus)):
            # Names of the attributes whose weight in this document is
            # above the tolerance.
            present = (name
                       for weight, name in zip(dense[doc_index], names)
                       if weight > .001)
            for name in present:
                self.assertIn(name, corpus.tokens[doc_index])
    def test_domain(self):
        """Attribute names come out sorted and every weighted word is a token
        of the document it is weighted in."""
        corpus = Corpus.from_file('deerwester')
        result = BowVectorizer().transform(corpus)

        attr_names = [attr.name for attr in result.domain.attributes]
        self.assertEqual(attr_names, sorted(attr_names))

        weights = result.X.toarray()
        for row in range(len(corpus)):
            for weight, word in zip(weights[row], attr_names):
                if not weight > .001:
                    # Treated as absent from this document.
                    continue
                self.assertIn(word, corpus.tokens[row])
 def tests_duplicated_names(self):
     """
     BoW adds words to the domain; when an attribute with the same name is
     already present, the existing attribute is renamed by appending a
     number to its name.
     """
     corpus = Corpus.from_file("deerwester")
     for extra_name in ("human", "testtest"):
         ones_column = np.ones((len(corpus), 1))
         corpus = corpus.extend_attributes(ones_column, [extra_name])

     out = BowVectorizer().transform(corpus)

     # "human" existed before BoW, so the pre-existing attribute is renamed;
     # "testtest" is expected to keep its name.
     self.assertEqual("human (1)", out.domain[0].name)
     self.assertEqual("testtest", out.domain[1].name)
     # the remaining attributes must contain the BoW word "human"
     later_names = [v.name for v in out.domain.attributes[1:]]
     self.assertIn("human", later_names)
    def test_count_correctness(self):
        """Counts must match the expected matrices for train and test data."""
        train_bow = BowVectorizer().transform(self.small_corpus_train)
        self.assert_bow_same(train_bow, self.train_counts, self.terms)

        # The test corpus is mapped onto the training domain, so only terms
        # from the training dataset appear in the result.
        test_bow = Corpus.from_table(train_bow.domain, self.small_corpus_test)
        self.assert_bow_same(test_bow, self.test_counts, self.terms)
    def test_compute_values_same_tfidf_regardless_num_documents(self):
        """
        TF-IDF obtained through compute values must not depend on how many
        documents the new corpus has: the IDF weighting should use only the
        counts of the corpus the vectorizer was fitted on.
        """
        vect = BowVectorizer(wglobal=BowVectorizer.IDF)
        corpus = Corpus.from_file('deerwester')
        train_corpus, test_corpus = corpus[:5], corpus[5:]

        bow = vect.transform(train_corpus)
        without_first = Corpus.from_table(bow.domain, test_corpus[1:])
        complete = Corpus.from_table(bow.domain, test_corpus)

        self.assertEqual(without_first.domain, complete.domain)
        self.assertEqual(bow.domain, complete.domain)
        # Dropping the first test document must not change the other rows.
        mismatches = (without_first.X != complete.X[1:]).nnz
        self.assertEqual(mismatches, 0)
Exemple #16
0
def tfidf_keywords(
        corpus: Corpus,
        progress_callback: Callable = None) -> List[List[Tuple[str, float]]]:
    """
    Extract keywords using TF-IDF.

    Parameters
    ----------
    corpus : Corpus
        Corpus whose documents are scored. Its existing attributes are
        dropped before vectorization; class variables and metas are kept.
    progress_callback : callable
        Function for reporting progress. It is called once per document with
        the fraction of documents processed so far. When None,
        ``dummy_callback`` is used.

    Returns
    -------
    keywords : list
        One list per document, holding a ``(word, weight)`` tuple for every
        bag-of-words feature that is non-zero in that document.
    """
    if progress_callback is None:
        progress_callback = dummy_callback

    # empty X part - to know that every feature of X is bag of words
    domain = Domain([],
                    class_vars=corpus.domain.class_vars,
                    metas=corpus.domain.metas)
    corpus = corpus.from_table(domain, corpus)

    vectorizer = BowVectorizer(
        wlocal=BowVectorizer.COUNT,
        # IDF weighting is only requested when there is more than one document
        wglobal=BowVectorizer.IDF if len(corpus) > 1 else BowVectorizer.NONE,
        norm=BowVectorizer.L2,
    )
    res = vectorizer.transform(corpus)
    X, words = res.X, [a.name for a in res.domain.attributes]

    keywords = []
    n_docs = X.shape[0]
    for doc_index, row in enumerate(X):
        progress_callback(doc_index / n_docs)
        nonzero = row.nonzero()
        # Column indices are read from the second array returned by
        # ``nonzero()``; when it returns fewer than two arrays the document
        # gets no keywords.
        # NOTE(review): presumably this guards against a non-2-D row - confirm.
        if len(nonzero) > 1:
            keywords.append([(words[col], row[0, col]) for col in nonzero[1]])
        else:
            keywords.append([])
    return keywords
    def test_compute_values_to_different_domain(self):
        """A BoW domain built on one corpus can be applied to another one."""
        source = Corpus.from_file('deerwester')
        destination = Corpus.from_file('book-excerpts')

        # Neither corpus carries attributes before vectorization.
        for untouched in (source, destination):
            self.assertFalse(untouched.domain.attributes)

        bow_domain = BowVectorizer().transform(source).domain
        computed = destination.transform(bow_domain)

        self.assertEqual(bow_domain.attributes, computed.domain.attributes)
Exemple #18
0
def add_embedding(corpus: Corpus) -> Corpus:
    """Transform ``corpus`` into a domain with BoW features and a 2-D PCA.

    The returned corpus uses the bag-of-words attributes and class variables,
    with the two PCA projection attributes appended to the metas.
    """
    bow = BowVectorizer().transform(corpus)

    projector = PCA(n_components=2)
    model = projector(bow)
    projected = model(bow)

    metas = chain(bow.domain.metas, projected.domain.attributes)
    domain = Domain(bow.domain.attributes, bow.domain.class_vars, metas)
    return corpus.transform(domain)
Exemple #19
0
def infer_ngrams_corpus(corpus, return_dict=False):
    """Build a ``Sparse2Corpus`` from the bag-of-words columns of ``corpus``.

    Attributes flagged with ``'bow-feature'`` are used; when there are none,
    the corpus is vectorized with a default ``BowVectorizer`` first. Columns
    whose sum is not positive are dropped and the rest are ordered by their id
    in a ``Dictionary`` built from the corpus n-grams.

    Returns the ``Sparse2Corpus``, or a ``(Sparse2Corpus, Dictionary)`` tuple
    when ``return_dict`` is true.
    """

    def collect_bow_features(source):
        # (column index, name) of every attribute flagged as a BoW feature
        return [(index, variable.name)
                for index, variable in enumerate(source.domain.attributes)
                if 'bow-feature' in variable.attributes]

    bow_features = collect_bow_features(corpus)
    if not bow_features:
        corpus = BowVectorizer().transform(corpus)
        bow_features = collect_bow_features(corpus)

    column_sums = corpus.X.sum(axis=0)
    used = [(index, name) for index, name in bow_features
            if column_sums[0, index] > 0]
    # sort features by the order in the dictionary
    dictionary = Dictionary(corpus.ngrams_iterator(include_postags=True),
                            prune_at=None)
    order = np.argsort([dictionary.token2id[name] for _, name in used])
    columns = [used[position][0] for position in order]
    result = Sparse2Corpus(corpus.X[:, columns].T)

    if return_dict:
        return result, dictionary
    return result
    def test_result(self):
        """The widget reports the expected p-values and FDR values for six
        words when the first document is the selected data."""
        preprocessors = PreprocessorList([BASE_TRANSFORMER, RegexpTokenizer()])
        corpus = preprocessors(Corpus.from_file("book-excerpts")[::3])
        bow = BowVectorizer().transform(corpus)

        # Restrict the domain to a handful of words.
        words = ["beheld", "events", "dragged", "basin", "visit", "have"]
        bow = bow.transform(Domain([bow.domain[word] for word in words]))

        self.send_signal(self.widget.Inputs.data, bow)
        self.send_signal(self.widget.Inputs.selected_data, bow[:1])
        self.wait_until_finished(timeout=100000)

        expected_p = [0.02128, 1, 0.04255, 0.06383, 0.08511, 0.97872]
        np.testing.assert_array_almost_equal(
            self.widget.results.p_values, expected_p, decimal=5)

        expected_fdr = [0.12766, 1, 0.12766, 0.12766, 0.12766, 1]
        np.testing.assert_array_almost_equal(
            self.widget.results.fdr_values, expected_fdr, decimal=5)
Exemple #21
0
def preprocess(corpus: Corpus) -> Corpus:
    """Preprocess ``corpus`` and attach a 2-component PCA of its BoW as metas.

    The corpus is passed in turn through lowercasing, ``\\w+`` tokenization,
    English stopword filtering and a frequency filter (0.1). It is then
    vectorized, projected with PCA and transformed into a domain that holds
    the BoW attributes plus the projection attributes in the metas.
    """
    pipeline = (
        LowercaseTransformer(),
        RegexpTokenizer(r"\w+"),
        StopwordsFilter("English"),
        FrequencyFilter(0.1),
    )
    for step in pipeline:
        corpus = step(corpus)

    bow = BowVectorizer().transform(corpus)

    projector = PCA(n_components=2)
    model = projector(bow)
    projected = model(bow)

    metas = chain(bow.domain.metas, projected.domain.attributes)
    domain = Domain(bow.domain.attributes, bow.domain.class_vars, metas)
    return corpus.transform(domain)
    def test_tfidf_correctness(self):
        """
        Check the TF-IDF values for the train and the test dataset.
        When the test dataset is computed through compute values, the IDF
        weights must still be the ones derived from the training dataset.
        """
        vectorizer = BowVectorizer(wglobal=BowVectorizer.IDF)
        bow = vectorizer.transform(self.small_corpus_train)

        # Number of training documents in which each term appears.
        document_appearance = (self.train_counts != 0).sum(0)
        n_train = len(self.train_counts)

        expected_train = self.train_counts * np.log(
            n_train / document_appearance)
        self.assert_bow_same(bow, expected_train, self.terms)

        bow_test = Corpus.from_table(bow.domain, self.small_corpus_test)
        # weights still come from the training dataset numbers
        expected_test = self.test_counts * np.log(
            n_train / document_appearance)
        self.assert_bow_same(bow_test, expected_test, self.terms)
    def test_args(self):
        """Constructor arguments select local weight, global weight and norm.

        A constant global weight without normalization must give the same
        corpus as the default vectorizer with the same local weighting, and
        L1 normalization must make every row sum to 1.
        """
        corpus = Corpus.from_file('deerwester')

        # ``wglobals`` lives on the class, so the temporary entry is removed
        # after the test to keep it from leaking into other tests.
        BowVectorizer.wglobals['const'] = lambda df, N: 1
        self.addCleanup(BowVectorizer.wglobals.pop, 'const', None)

        vect = BowVectorizer(norm=BowVectorizer.NONE,
                             wlocal=BowVectorizer.COUNT,
                             wglobal='const')

        self.assertEqualCorpus(vect.transform(corpus),
                               BowVectorizer(wlocal=BowVectorizer.COUNT).transform(corpus))

        vect = BowVectorizer(norm=BowVectorizer.NONE,
                             wlocal=BowVectorizer.BINARY,
                             wglobal='const')
        self.assertEqualCorpus(vect.transform(corpus),
                               BowVectorizer(wlocal=BowVectorizer.BINARY).transform(corpus))

        vect = BowVectorizer(norm=BowVectorizer.L1,
                             wlocal=BowVectorizer.COUNT,
                             wglobal='const')
        x = vect.transform(corpus).X
        # Sum of absolute deviations of the row sums from 1.
        self.assertAlmostEqual(abs(x.sum(axis=1) - 1).sum(), 0)

def fp(score):
    """Format ``score`` for display in a cell.

    Values above 0.01 get five decimals; anything else is written in
    scientific notation with one decimal.
    """
    return "%0.5f" % score if score > 10e-3 else "%0.1e" % score


def fpt(score):
    """Format ``score`` with more digits than ``fp`` (used for tooltips).

    Values above 0.01 get nine decimals; anything else is written in
    scientific notation with five decimals.
    """
    return "%0.9f" % score if score > 10e-3 else "%0.5e" % score


class EATreeWidgetItem(QTreeWidgetItem):
    """Tree row showing a word together with two scores.

    The raw ``word``, ``p_value`` and ``f_value`` are kept in ``self.data`` so
    that rows are ordered by those values and not by the formatted text.
    """

    def __init__(self, word, p_value, f_value, parent):
        super().__init__(parent)
        self.data = [word, p_value, f_value]
        self.setText(0, word)
        # Score columns: short format as text, longer format as tooltip.
        for column, value in enumerate((p_value, f_value), start=1):
            self.setText(column, fp(value))
            self.setToolTip(column, fpt(value))

    def __lt__(self, other):
        """Compare rows by the raw value in the tree's current sort column."""
        sort_column = self.treeWidget().sortColumn()
        mine, theirs = self.data[sort_column], other.data[sort_column]
        return mine < theirs


if __name__ == '__main__':
    from orangewidget.utils.widgetpreview import WidgetPreview
    from orangecontrib.text.vectorization import BowVectorizer

    # Preview the widget on the book-excerpts bag of words, with the first
    # ten documents as the selected data.
    corpus_vect = BowVectorizer().transform(Corpus.from_file('book-excerpts'))
    WidgetPreview(OWWordEnrichment).run(
        set_data_selected=corpus_vect[:10],
        set_data=corpus_vect,
    )
 def setUp(self):
     """Create the widget and a BoW corpus from every third book excerpt;
     the first five rows serve as the subset."""
     self.widget = self.create_widget(OWWordEnrichment)
     every_third = Corpus.from_file('book-excerpts')[::3]
     self.corpus_vect = BowVectorizer().transform(every_third)
     self.subset_corpus = self.corpus_vect[:5]
Exemple #26
0
    def onDeleteWidget(self):
        """Call ``self.shutdown()`` and then the parent ``onDeleteWidget``."""
        # NOTE(review): presumably shutdown() stops background work before the
        # widget is removed - confirm against its definition.
        self.shutdown()
        super().onDeleteWidget()


if __name__ == "__main__":
    from Orange.projection import PCA
    from Orange.widgets.utils.widgetpreview import WidgetPreview
    from orangecontrib.text.preprocess import LowercaseTransformer, \
        RegexpTokenizer, StopwordsFilter, FrequencyFilter
    from orangecontrib.text.vectorization import BowVectorizer

    # Preprocess the book excerpts step by step.
    corpus_ = Corpus.from_file("book-excerpts")
    pipeline_ = (
        LowercaseTransformer(),
        RegexpTokenizer(r"\w+"),
        StopwordsFilter("English"),
        FrequencyFilter(0.1),
    )
    for pp in pipeline_:
        corpus_ = pp(corpus_)

    transformed_corpus = BowVectorizer().transform(corpus_)

    # Project the bag of words onto two PCA components.
    pca = PCA(n_components=2)
    pca_model = pca(transformed_corpus)
    projection = pca_model(transformed_corpus)

    # Keep the BoW attributes and append the projection to the metas.
    metas_ = chain(transformed_corpus.domain.metas,
                   projection.domain.attributes)
    domain_ = Domain(transformed_corpus.domain.attributes,
                     transformed_corpus.domain.class_vars,
                     metas_)
    corpus_ = corpus_.transform(domain_)

    WidgetPreview(OWAnnotator).run(set_data=corpus_)
 def test_binary(self):
     """With binary local weighting the largest value in X is 1."""
     binary_vectorizer = BowVectorizer(wlocal=BowVectorizer.BINARY)
     bow = binary_vectorizer.transform(Corpus.from_file('deerwester'))
     largest = bow.X.max()
     self.assertEqual(largest, 1.)
    def test_args(self):
        """Constructor arguments select local weight, global weight and norm.

        A constant global weight without normalization must give the same
        corpus as the default vectorizer with the same local weighting, and
        L1 normalization must make every row sum to 1.
        """
        corpus = Corpus.from_file('deerwester')

        # ``wglobals`` lives on the class, so the temporary entry is removed
        # after the test to keep it from leaking into other tests.
        BowVectorizer.wglobals['const'] = lambda df, N: 1
        self.addCleanup(BowVectorizer.wglobals.pop, 'const', None)

        for wlocal in (BowVectorizer.COUNT, BowVectorizer.BINARY):
            vect = BowVectorizer(norm=BowVectorizer.NONE,
                                 wlocal=wlocal,
                                 wglobal='const')
            self.assertEqualCorpus(
                vect.transform(corpus),
                BowVectorizer(wlocal=wlocal).transform(corpus))

        vect = BowVectorizer(norm=BowVectorizer.L1,
                             wlocal=BowVectorizer.COUNT,
                             wglobal='const')
        x = vect.transform(corpus).X
        # Sum of absolute deviations of the row sums from 1.
        self.assertAlmostEqual(abs(x.sum(axis=1) - 1).sum(), 0)
 def test_report(self):
     """A default vectorizer produces a non-empty report."""
     report = BowVectorizer().report()
     self.assertGreater(len(report), 0)
 def test_report(self):
     """report() of a default vectorizer has at least one entry."""
     entry_count = len(BowVectorizer().report())
     self.assertGreater(entry_count, 0)
    def test_empty_tokens(self):
        """With no text features and ``copy=False`` the very same corpus
        object is returned."""
        corpus = Corpus.from_file('deerwester')
        corpus.text_features = []

        vectorizer = BowVectorizer()
        returned = vectorizer.transform(corpus, copy=False)

        self.assertIs(corpus, returned)
Exemple #32
0
 def setUp(self):
     """Create the widget and vectorize the full book-excerpts corpus."""
     # type: OWWordEnrichment
     self.widget = self.create_widget(OWWordEnrichment)
     self.corpus = Corpus.from_file('book-excerpts')
     self.corpus_vect = BowVectorizer().transform(self.corpus)
 def test_binary(self):
     """Binary local weighting caps the values of X at 1."""
     corpus = Corpus.from_file('deerwester')
     vectorized = BowVectorizer(wlocal=BowVectorizer.BINARY).transform(corpus)
     self.assertEqual(vectorized.X.max(), 1.)
Exemple #34
0
 def ngrams_corpus(self):
     """Return the stored n-grams corpus, or the one of a default BoW
     transform of this object when none is stored."""
     stored = self._ngrams_corpus
     if stored is not None:
         return stored
     return BowVectorizer().transform(self).ngrams_corpus