Ejemplo n.º 1
0
    def open_file(self, path=None, data=None):
        """Load a corpus from ``data`` or ``path`` and refresh the feature lists.

        ``data`` takes precedence over ``path``. When neither is given, or
        when reading ``path`` fails, the widget state is only reset.

        Args:
            path: Location of a corpus file handed to ``Corpus.from_file``.
            data: Table converted with ``Corpus.from_table``.
        """
        self.closeContext()
        self.Error.clear()
        self.unused_attrs_model[:] = []
        self.used_attrs_model[:] = []
        if data:
            self.corpus = Corpus.from_table(data.domain, data)
        elif path:
            try:
                self.corpus = Corpus.from_file(path)
                self.corpus.name = os.path.splitext(os.path.basename(path))[0]
            except Exception as err:
                self.Error.read_file(path, str(err))
                # Nothing new was loaded: stop here instead of continuing
                # with a stale (or missing) corpus.
                return
        else:
            return

        self.update_info()
        self.used_attrs = list(self.corpus.text_features)
        if not self.corpus.text_features:
            # A corpus without text features cannot be used downstream.
            self.Error.corpus_without_text_features()
            self.Outputs.corpus.send(None)
            return
        self.openContext(self.corpus)
        self.used_attrs_model.extend(self.used_attrs)
        # Remaining string metas are offered as unused candidates.
        self.unused_attrs_model.extend(
            [f for f in self.corpus.domain.metas
             if f.is_string and f not in self.used_attrs_model])
Ejemplo n.º 2
0
    def test_compute_values(self):
        # Re-applying the BoW domain to the raw corpus must reproduce the
        # vectorizer's output.
        source = Corpus.from_file('deerwester')
        vectorizer = BowVectorizer()

        expected = vectorizer.transform(source)
        recomputed = Corpus.from_table(expected.domain, source)

        self.assertEqual(expected.domain, recomputed.domain)
        differing_cells = (expected.X != recomputed.X).nnz
        self.assertEqual(differing_cells, 0)
Ejemplo n.º 3
0
    def test_infer_text_features(self):
        # Each dataset should yield exactly one inferred text feature.
        expectations = (
            ('friends-transcripts', 'Quote'),
            ('deerwester', 'Text'),
        )
        for dataset, feature_name in expectations:
            inferred = Corpus.from_file(dataset).text_features
            self.assertEqual(len(inferred), 1)
            self.assertEqual(inferred[0].name, feature_name)
Ejemplo n.º 4
0
    def test_compute_values_to_different_domain(self):
        # A BoW domain built on one corpus must be applicable to another.
        src = Corpus.from_file('deerwester')
        dst = Corpus.from_file('book-excerpts')

        for raw in (src, dst):
            self.assertFalse(raw.domain.attributes)

        vectorized = BowVectorizer().transform(src)
        transformed = dst.transform(vectorized.domain)

        self.assertEqual(vectorized.domain.attributes,
                         transformed.domain.attributes)
Ejemplo n.º 5
0
    def test_corpus_from_file(self):
        # (dataset name, expected number of documents)
        for dataset, n_docs in (('book-excerpts', 140), ('deerwester', 9)):
            loaded = Corpus.from_file(dataset)
            self.assertEqual(len(loaded), n_docs)
            self.assertEqual(len(loaded.domain), 1)
            self.assertEqual(len(loaded.domain.metas), 1)
            self.assertEqual(loaded.metas.shape, (n_docs, 1))
Ejemplo n.º 6
0
def main():
    """Show ``OWWordCloud`` fed with a one-topic table and a one-document corpus."""
    from Orange.data import Table, Domain, ContinuousVariable, StringVariable

    phrase = 'hey~mr. tallyman tally~me banana daylight come and me wanna go home'
    # '~' marks a space inside a multi-word token.
    tokens = [token.replace('~', ' ') for token in phrase.split()]
    words = np.array(tokens, dtype=object, ndmin=2).T
    weights = np.random.random((len(words), 1))

    topic_metas = np.zeros((len(words), 0))
    topic_vars = []
    for index, weight_column in enumerate(weights.T):
        topic_metas = np.column_stack((topic_metas, words, weight_column))
        topic_vars = topic_vars + [StringVariable('Topic' + str(index)),
                                   ContinuousVariable('weights')]
    topic_domain = Domain([], metas=topic_vars)
    topics = Table.from_numpy(topic_domain,
                              X=np.zeros((len(words), 0)),
                              metas=topic_metas)

    app = QtGui.QApplication([''])
    widget = OWWordCloud()
    widget.on_topics_change(topics)

    text_domain = Domain([], metas=[StringVariable('text')])
    joined_text = np.array([[' '.join(words.flat)]])
    corpus = Corpus.from_numpy(text_domain, X=np.zeros((1, 0)), metas=joined_text)
    widget.on_corpus_change(corpus)
    widget.show()
    app.exec()
Ejemplo n.º 7
0
    def test_transform(self):
        # The vectorizer returns a Corpus whose domain has 43 variables.
        vectorizer = BowVectorizer()
        documents = Corpus.from_file('deerwester')

        bow = vectorizer.transform(documents)
        self.assertIsInstance(bow, Corpus)
        self.assertEqual(len(bow.domain), 43)
Ejemplo n.º 8
0
    def test_corpus_from_file(self):
        # Loading by explicit path: 140 documents, two meta columns.
        dataset = os.path.join(DATASET_PATH, 'bookexcerpts.txt')
        corpus = Corpus.from_file(dataset)
        self.assertEqual(len(corpus), 140)

        domain = corpus.domain
        self.assertEqual(len(domain), 0)
        self.assertEqual(len(domain.metas), 2)
        self.assertEqual(corpus.metas.shape, (140, 2))
Ejemplo n.º 9
0
    def test_corpus_from_file_just_text(self):
        # A text-only file: 9 documents and a single meta column.
        dataset = os.path.join(DATASET_PATH, 'deerwester.tab')
        corpus = Corpus.from_file(dataset)

        self.assertEqual(len(corpus), 9)
        domain = corpus.domain
        self.assertEqual(len(domain), 0)
        self.assertEqual(len(domain.metas), 1)
        self.assertEqual(corpus.metas.shape, (9, 1))
Ejemplo n.º 10
0
    def test_documents(self):
        # One document per row, and every document is a plain string.
        corpus = Corpus.from_file('book-excerpts')
        documents = corpus.documents
        seen_types = {type(document) for document in documents}

        self.assertEqual(len(documents), len(corpus))
        self.assertEqual(len(seen_types), 1)
        self.assertIn(str, seen_types)
Ejemplo n.º 11
0
    def test_corpus_not_eq(self):
        # Each variant differs from the reference corpus in exactly one aspect.
        reference = Corpus.from_file('book-excerpts')
        row_count = reference.X.shape[0]

        # no text features
        variant = Corpus(reference.domain, reference.X, reference.Y,
                         reference.metas, reference.W, [])
        self.assertNotEqual(reference, variant)

        # different X
        variant = Corpus(reference.domain, np.ones((row_count, 1)), reference.Y,
                         reference.metas, reference.W, reference.text_features)
        self.assertNotEqual(reference, variant)

        # different Y
        variant = Corpus(reference.domain, reference.X, np.ones((row_count, 1)),
                         reference.metas, reference.W, reference.text_features)
        self.assertNotEqual(reference, variant)

        # one meta value blanked
        altered_metas = np.copy(reference.metas)
        altered_metas[0, 0] = ''
        variant = Corpus(reference.domain, reference.X, reference.Y,
                         altered_metas, reference.W, reference.text_features)
        self.assertNotEqual(reference, variant)

        # different meta variable in the domain
        replacement_meta = [StringVariable('text2')]
        altered_domain = Domain(reference.domain.attributes,
                                reference.domain.class_var,
                                replacement_meta)
        variant = Corpus(altered_domain, reference.X, reference.Y,
                         reference.metas, reference.W, replacement_meta)
        self.assertNotEqual(reference, variant)

        # different n-gram range
        variant = reference.copy()
        variant.ngram_range = (2, 4)
        self.assertNotEqual(reference, variant)
Ejemplo n.º 12
0
    def test_create_bow(self):
        # TF-IDF bag of words over deerwester: 9 rows, 42 columns.
        documents = Corpus.from_file('deerwester')
        bow = self.bow(documents, use_tfidf=True)

        self.assertIsNotNone(bow.X)
        self.assertEqual(9, bow.X.shape[0])
        self.assertEqual(42, bow.X.shape[1])
        # callback counters are maintained by the test case itself
        self.assertEqual(self.progress_callbacks, 4)
        self.assertEqual(self.error_callbacks, 0)
Ejemplo n.º 13
0
 def test_POSTagger(self):
     # Tagging must attach `pos_tags`, with one tag per token in every document.
     documents = Corpus.from_file('deerwester')
     pos_tagger = tag.AveragedPerceptronTagger()
     tagged = pos_tagger.tag_corpus(documents)
     self.assertTrue(hasattr(tagged, 'pos_tags'))
     # NOTE: a stricter per-token check against '[a-z]+_[A-Z]+' is disabled here.
     for doc_tokens, doc_tags in zip(tagged.tokens, tagged.pos_tags):
         self.assertEqual(len(doc_tokens), len(doc_tags))
Ejemplo n.º 14
0
 def set_data(self, data=None):
     """Store the input as the widget's corpus and commit.

     Inputs that are not already a ``Corpus`` are converted with
     ``Corpus.from_table``. ``None`` clears the corpus.
     """
     self.reset_widget()
     self.corpus = data
     if data is None:
         self.commit()
         return

     if not isinstance(data, Corpus):
         self.corpus = Corpus.from_table(data.domain, data)
     self.load_features()
     self.regenerate_docs()
     self.commit()
Ejemplo n.º 15
0
    def test_from_table(self):
        # Converting a Table keeps its rows and metas and picks the first
        # meta variable as the text feature.
        table = Table.from_file('brown-selected')
        self.assertIsInstance(table, Table)

        corpus = Corpus.from_table(table.domain, table)
        self.assertIsInstance(corpus, Corpus)
        self.assertEqual(len(table), len(corpus))
        np.testing.assert_equal(table.metas, corpus.metas)
        first_meta = table.domain.metas[0]
        self.assertEqual(corpus.text_features, [first_meta])
Ejemplo n.º 16
0
 def test_empty_corpus(self):
     """
     Empty data.
     GH-247
     """
     # Vectorizing a corpus with no rows must give back an equal corpus.
     no_documents = Corpus.from_file("deerwester")[:0]
     vectorizer = BowVectorizer(norm=BowVectorizer.L1)
     transformed = vectorizer.transform(no_documents)
     self.assertEqual(transformed, no_documents)
Ejemplo n.º 17
0
 def on_data(self, data):
     """Accept new input data and schedule a refresh of the map."""
     needs_conversion = data and not isinstance(data, Corpus)
     if needs_conversion:
         data = Corpus.from_table(data.domain, data)
     self.data = data
     self._repopulate_attr_combo(data)
     if data:
         QTimer.singleShot(0, self.on_attr_change)
         return
     # No data: clear the selection and render an empty map.
     self.region_selected('')
     QTimer.singleShot(0, lambda: self.webview.evalJS('DATA = {}; renderMap();'))
Ejemplo n.º 18
0
    def test_init_preserve_shape_of_empty_x(self):
        # An all-zero sparse X must keep its shape when passed to Corpus().
        source = Corpus.from_file('book-excerpts')
        old_domain = source.domain
        domain_with_attr = Domain((ContinuousVariable('c1'),),
                                  old_domain.class_vars,
                                  old_domain.metas)

        all_zero_X = csr_matrix((len(source), 1))
        rebuilt = Corpus(domain_with_attr, X=all_zero_X, Y=source.Y,
                         metas=source.metas)

        self.assertEqual(all_zero_X.nnz, 0)
        self.assertEqual(rebuilt.X.shape, all_zero_X.shape)
Ejemplo n.º 19
0
 def set_data(self, data=None):
     """Reset the widget and, when data is given, load it and send the corpus on."""
     self.reset_widget()  # Drop whatever was shown before.
     if data is None:
         return

     self.corpus = data
     if isinstance(data, Table):
         self.corpus = Corpus.from_table(data.domain, data)
     self.load_features()
     self.regenerate_documents()
     # Forward the corpus to the output channel.
     self.send(Output.CORPUS, self.corpus)
Ejemplo n.º 20
0
    def test_compute_values_to_different_domain(self):
        # Attributes produced on self.corpus must carry over to another corpus.
        target = Corpus.from_file('andersen')

        for raw in (self.corpus, target):
            self.assertFalse(raw.domain.attributes)

        scored = self.method.transform(self.corpus)
        recomputed = target.transform(scored.domain)

        self.assertTrue(scored.domain.attributes)
        self.assertEqual(scored.domain.attributes,
                         recomputed.domain.attributes)
Ejemplo n.º 21
0
    def test_documents_from_features(self):
        corpus = Corpus.from_file('book-excerpts')
        documents = corpus.documents_from_features([corpus.domain.class_var])
        seen_types = {type(document) for document in documents}

        # Every document must contain exactly one of the class labels.
        labels = corpus.domain.class_var.values
        hits_per_document = [sum(label in document for label in labels)
                             for document in documents]
        self.assertTrue(all([hits == 1 for hits in hits_per_document]))
        self.assertEqual(len(documents), len(corpus))
        self.assertEqual(len(seen_types), 1)
        self.assertIn(str, seen_types)
Ejemplo n.º 22
0
 def test_ngrams(self):
     # With an n-gram range of (1, 3) the BoW domain must contain uni-, bi-
     # and trigrams built from the corpus tokens.
     vect = BowVectorizer()
     corpus = Corpus.from_file('deerwester')
     # Raw string: '\w' in a normal literal is an invalid escape sequence.
     pr = preprocess.Preprocessor(tokenizer=preprocess.RegexpTokenizer(r'\w+'),
                                  ngrams_range=(1, 3))
     pr(corpus, inplace=True)
     result = vect.transform(corpus)
     attrs = [attr.name for attr in result.domain.attributes]
     first_doc = corpus.tokens[0]
     self.assertIn(first_doc[1], attrs)
     self.assertIn(' '.join(first_doc[:2]), attrs)
     self.assertIn(' '.join(first_doc[:3]), attrs)
Ejemplo n.º 23
0
 def test_corpus_remove_text_features(self):
     """
     Remove those text features which do not have a column in metas.
     GH-324
     GH-325
     """
     original = Corpus.from_file('deerwester')
     # Target domain keeps attributes and class variables but has no metas.
     metaless_domain = Domain(attributes=original.domain.attributes,
                              class_vars=original.domain.class_vars)
     transformed = original.transform(metaless_domain)
     self.assertFalse(len(transformed.text_features))
     # Copying must still work after the text features are gone.
     transformed.copy()
Ejemplo n.º 24
0
    def search(self, query, date_from=None, date_to=None, max_docs=None,
               on_progress=None, should_break=None):
        """
        Download the documents matching ``query`` page by page.

        Args:
            query (str): Search query.
            date_from (date): Start date limit.
            date_to (date): End date limit.
            max_docs (int): Maximal number of documents returned; capped at
                ``MAX_DOCS`` and at the number of hits reported in the first
                response.
            on_progress (callback): Called after every downloaded page with
                the number of records so far and the expected total.
            should_break (callback): Checked before each additional page. If
                it evaluates to True, downloading stops and the documents
                downloaded so far are returned in a Corpus.

        Returns:
            Corpus: Search results, or None when the first page yields no data.

        Raises:
            RuntimeError: If the API key is not valid.
        """
        if not self.api_key_valid():
            raise RuntimeError('The API key is not valid.')

        limit = max_docs
        if limit is None or limit > MAX_DOCS:
            limit = MAX_DOCS

        notify = on_progress if callable(on_progress) else None
        stop_requested = should_break if callable(should_break) else None

        # TODO create corpus on the fly and extend, so it stops faster.
        first_page, _ = self._fetch_page(query, date_from, date_to, 0)
        if first_page is None:
            return None
        documents = list(first_page['response']['docs'])
        limit = min(first_page['response']['meta']['hits'], limit)
        if notify is not None:
            notify(len(documents), limit)

        # Page 0 is already downloaded; fetch the remaining ones.
        page_count = math.ceil(limit / BATCH_SIZE)
        for page_number in range(1, page_count):
            if stop_requested is not None and stop_requested():
                break

            page, from_cache = self._fetch_page(query, date_from, date_to,
                                                page_number)
            if page is None:
                break

            documents.extend(page['response']['docs'])
            if notify is not None:
                notify(len(documents), limit)

            # Only pause after requests that were not served from cache.
            if not from_cache:
                sleep(SLEEP)

        if len(documents) > limit:
            documents = documents[:limit]

        return Corpus.from_documents(documents, 'NY Times', self.attributes,
                                     self.class_vars, self.metas,
                                     title_indices=[-1])
Ejemplo n.º 25
0
 def test_corpus_not_eq(self):
     # Each variant changes one constructor argument of the reference corpus.
     base = Corpus.from_file('bookexcerpts')

     # one document fewer
     other = Corpus(base.documents[:-1], base.X, base.Y, base.metas, base.domain)
     self.assertNotEqual(base, other)

     # X stacked twice
     doubled_X = np.vstack((base.X, base.X))
     other = Corpus(base.documents, doubled_X, base.Y, base.metas, base.domain)
     self.assertNotEqual(base, other)

     # Y stacked twice
     doubled_Y = np.vstack((base.Y, base.Y))
     other = Corpus(base.documents, base.X, doubled_Y, base.metas, base.domain)
     self.assertNotEqual(base, other)

     # transposed metas
     other = Corpus(base.documents, base.X, base.Y, base.metas.T, base.domain)
     self.assertNotEqual(base, other)

     # different meta variable in the domain
     altered_domain = Domain(base.domain.attributes, base.domain.class_var,
                             [StringVariable('text2')])
     other = Corpus(base.documents, base.X, base.Y, base.metas, altered_domain)
     self.assertNotEqual(base, other)
Ejemplo n.º 26
0
    def test_copy(self):
        # A copy is a distinct object. It compares unequal once only the copy
        # is preprocessed, and equal when copied after preprocessing.
        corpus = Corpus.from_file('deerwester')

        # Raw string: '\w' and '\s' in a normal literal are invalid escapes.
        p = preprocess.Preprocessor(tokenizer=preprocess.RegexpTokenizer(r'\w+\s}'))
        copied = corpus.copy()
        p(copied, inplace=True)
        self.assertIsNot(copied, corpus)
        self.assertNotEqual(copied, corpus)

        p(corpus, inplace=True)
        copied = corpus.copy()
        self.assertIsNot(copied, corpus)
        self.assertEqual(copied, corpus)
Ejemplo n.º 27
0
    def test_extend_corpus(self):
        # Extending a corpus with its own metas doubles it; the unseen label
        # 'teenager' adds one class value.
        corpus = Corpus.from_file('book-excerpts')
        classes_before = len(corpus.domain.class_var.values)
        snapshot = corpus.copy()
        labels = [corpus.domain.class_var.values[int(index)]
                  for index in corpus.Y]
        labels[0] = 'teenager'
        corpus.extend_corpus(corpus.metas, labels)

        self.assertEqual(len(corpus), len(snapshot)*2)
        self.assertEqual(corpus.Y.shape[0], snapshot.Y.shape[0]*2)
        self.assertEqual(corpus.metas.shape[0], snapshot.metas.shape[0]*2)
        self.assertEqual(corpus.metas.shape[1], snapshot.metas.shape[1])
        self.assertEqual(len(snapshot.domain.class_var.values),
                         classes_before+1)
Ejemplo n.º 28
0
    def test_domain(self):
        vectorizer = BowVectorizer()
        documents = Corpus.from_file('deerwester')

        bow = vectorizer.transform(documents)
        names = [variable.name for variable in bow.domain.attributes]
        # attribute names come out alphabetically ordered
        self.assertEqual(names, sorted(names))

        # every attribute with a non-zero weight must be a token of that document
        dense = bow.X.toarray()
        for row_index in range(len(documents)):
            weighted_names = zip(dense[row_index], names)
            for weight, name in weighted_names:
                if weight > .001:
                    self.assertIn(name, documents.tokens[row_index])
Ejemplo n.º 29
0
    def test_extend_corpus(self):
        # Extending a corpus with its own documents doubles it; the unseen
        # label 'teenager' adds one class value.
        dataset = os.path.join(DATASET_PATH, 'bookexcerpts.tab')
        corpus = Corpus.from_file(dataset)
        classes_before = len(corpus.domain.class_var.values)
        snapshot = corpus.copy()
        labels = [corpus.domain.class_var.values[int(index)]
                  for index in corpus.Y]
        labels[0] = 'teenager'
        corpus.extend_corpus(corpus.documents, corpus.metas, labels)

        self.assertEqual(len(corpus), len(snapshot)*2)
        self.assertEqual(corpus.Y.shape[0], snapshot.Y.shape[0]*2)
        self.assertEqual(corpus.metas.shape[0], snapshot.metas.shape[0]*2)
        self.assertEqual(corpus.metas.shape[1], snapshot.metas.shape[1])
        self.assertEqual(len(snapshot.domain.class_var.values),
                         classes_before+1)
Ejemplo n.º 30
0
 def open_file(self, path):
     """Load the corpus stored at ``path`` and refresh the attribute lists.

     The used/unused attribute lists are always cleared first. Failures
     while loading are reported through ``self.Error.read_file``.

     Args:
         path: Location of the corpus file; a falsy value only clears state.
     """
     self.Error.read_file.clear()
     self.used_attrs[:] = []
     self.unused_attrs[:] = []
     if not path:
         return
     try:
         self.corpus = Corpus.from_file(path)
         self.corpus.name = os.path.splitext(os.path.basename(path))[0]
         self.info_label.setText("Corpus of {} documents.".format(len(self.corpus)))
         self.used_attrs.extend(self.corpus.text_features)
         self.unused_attrs.extend([f for f in self.corpus.domain.metas
                                   if f.is_string and f not in self.corpus.text_features])
     except Exception as err:
         # Exception rather than BaseException, so KeyboardInterrupt and
         # SystemExit are not swallowed and shown as read errors.
         self.Error.read_file(path, str(err))
Ejemplo n.º 31
0
 def test_set_text_features(self):
     # set_text_features(None) must return what _infer_text_features() returns.
     corpus = Corpus.from_file('friends-transcripts')[:100]
     clone = corpus.copy()
     reset_result = corpus.set_text_features(None)
     inferred = clone._infer_text_features()
     self.assertEqual(reset_result, inferred)
Ejemplo n.º 32
0
 def test_empty_corpus(self):
     # Transforming a corpus with no rows still adds the new columns.
     no_documents = Corpus.from_file('deerwester')[:0]
     scored = self.method.transform(no_documents)
     expected_width = len(self.corpus.domain) + self.new_cols
     self.assertEqual(len(scored.domain), expected_width)
     self.assertEqual(len(scored), 0)
Ejemplo n.º 33
0
 def test_has_tokens(self):
     # has_tokens() is False on a freshly loaded corpus and True after
     # tokens are stored explicitly.
     documents = Corpus.from_file('deerwester')
     self.assertFalse(documents.has_tokens())
     default_tokens = documents.tokens   # default tokenizer
     documents.store_tokens(default_tokens)
     self.assertTrue(documents.has_tokens())
Ejemplo n.º 34
0
        self.profiler.new_token()
        self.token = self.profiler.token
        self.refresh_token_info()
        self.commit()

    def token_changed(self):
        """Copy the widget's token to the profiler, refresh the token info and commit."""
        self.profiler.token = self.token
        self.refresh_token_info()
        self.commit()

    def refresh_token_info(self):
        """Store the profiler's current credit, as a string, in ``self.credit``."""
        self.credit = str(self.profiler.get_credit())

    def send_report(self):
        """Report the document count, selected attribute, model and output mode."""
        # An out-of-range attribute index is reported as an empty string.
        if len(self.strings_attrs) > self.tweet_attr:
            attribute = self.strings_attrs[self.tweet_attr]
        else:
            attribute = ''
        rows = [
            ('Documents', self.n_documents),
            ('Attribute', attribute),
            ('Emotions', self.model_name),
            ('Output', self.output_mode),
        ]
        self.report_items(rows)


if __name__ == '__main__':
    # Manual preview: show the widget with the first 100 rows of the
    # bundled tweets file.
    app = QtGui.QApplication([])
    corpus = Corpus.from_file('Election-2016-Tweets.tab')
    widget = OWTweetProfiler()
    first_hundred = corpus[:100]
    widget.set_corpus(first_hundred)
    widget.show()
    app.exec()
Ejemplo n.º 35
0
 def setUp(self):
     """Create the ``OWTBagOfWords`` widget and load the 'deerwester' corpus."""
     self.widget = self.create_widget(OWTBagOfWords)
     self.corpus = Corpus.from_file('deerwester')
Ejemplo n.º 36
0
"""
input: Corpus preprocessed with Preprocess Text. Tokenizer is set to Sentences.
output: Corpus where sentences are now documents.
requires: Text add-on
"""

import numpy as np
from Orange.data import Domain, StringVariable
from orangecontrib.text.corpus import Corpus

tokens = in_data.tokens
# The first meta variable carrying a "title" attribute labels every sentence
# with the document it came from.
title = [i for i in in_data.domain.metas if "title" in i.attributes][0]
# Create the variable once so the domain and the text features share it.
sentences = StringVariable('Sentences')
new_domain = Domain(attributes=[], metas=[sentences, title])

titles = []
content = []


# One output row per token (a sentence, given the Sentences tokenizer).
for i, doc in enumerate(tokens):
    doc_title = in_data[i][title.name].value
    for t in doc:
        titles.append(doc_title)
        content.append(t)

metas = np.column_stack((content, titles))
out_data = Corpus.from_numpy(domain=new_domain, X=np.empty((len(content), 0)),
                             metas=metas)
out_data.set_text_features([sentences])
out_data.set_title_variable(title)
Ejemplo n.º 37
0
 def setUp(self):
     """Load the 'deerwester' corpus and set up the ``SentiArt`` method under test."""
     self.corpus = Corpus.from_file('deerwester')
     self.method = SentiArt()
     # Number of columns the tests expect the method to add.
     self.new_cols = 7
Ejemplo n.º 38
0
 def test_extend_corpus_non_empty_X(self):
     # Extending this corpus is expected to be rejected with a ValueError.
     first_ten = Corpus.from_file('election-tweets-2016')[:10]
     with self.assertRaises(ValueError):
         first_ten.extend_corpus(first_ten.metas, first_ten.Y)
Ejemplo n.º 39
0
 def setUp(self):
     """Load the 'deerwester' corpus and create the ``OWTopicModeling`` widget."""
     self.corpus = Corpus.from_file("deerwester")
     self.widget = self.create_widget(OWTopicModeling)
Ejemplo n.º 40
0
                            f"{cc_len}"
            input_string = (f"{cor_output_len or 0} documents\n"
                            f"{n_selected or 0} selected words\n"
                            f"{cc_len} words with counts")
            self.info.set_output_summary(input_numbers, input_string)

    def send_report(self):
        """Append the rendered page body and the table to the report."""
        if not self.webview:
            return

        page = self.webview.html()
        body_tag = page.index("<body")
        content_start = page.index(">", body_tag) + 1
        content_end = page.index("</body>")
        content = page[content_start:content_end]
        # The words are absolutely positioned, so wrap them in a div that
        # reserves the canvas height in the report.
        canvas_height = self.webview._evalJS(
            "document.getElementById('canvas').clientHeight")
        wrapper = (f"<div style='position: relative; height: {canvas_height}px;'>"
                   f"{content}</div>")
        self.report_html += wrapper

        self.report_table(self.tableview)

    def sizeHint(self) -> QtCore.QSize:
        """Return the inherited size hint, expanded to at least 900 x 500."""
        return super().sizeHint().expandedTo(QSize(900, 500))


if __name__ == "__main__":
    # Manual preview: run the word cloud widget on the 'book-excerpts' corpus.
    from orangewidget.utils.widgetpreview import WidgetPreview

    corpus = Corpus.from_file("book-excerpts")
    WidgetPreview(OWWordCloud).run(corpus)
Ejemplo n.º 41
0
 def setUp(self):
     """Create the ``OWConcordance`` widget and load the 'deerwester' corpus."""
     self.widget = self.create_widget(OWConcordance)  # type: OWConcordance
     self.corpus = Corpus.from_file('deerwester')
Ejemplo n.º 42
0
 def setUp(self):
     """Create the ``OWSentimentAnalysis`` widget and load the 'deerwester' corpus."""
     self.widget = self.create_widget(OWSentimentAnalysis)
     self.corpus = Corpus.from_file('deerwester')
Ejemplo n.º 43
0
                           ("Matching documents",
                            self.n_matching), ("Matches", self.n_matches)))

    def showEvent(self, event):
        """Run the inherited show handling, then adjust the splitter sizes."""
        super().showEvent(event)
        self.update_splitter()

    def update_splitter(self):
        """
        Resize the splitter so that the document list on the left never
        takes more than 1/3 of the space. This only runs on showEvent; if
        the user changes the sizes afterwards they stay as they are.
        """
        left, right = self.splitter.sizes()
        total = left + right
        if not right < 2 / 3 * total:
            # The right pane already has at least two thirds.
            return
        self.splitter.setSizes([int(total * 1 / 3), int(total * 2 / 3)])


if __name__ == '__main__':
    from orangecontrib.text.preprocess import BASE_TOKENIZER
    from orangecontrib.text.tag.pos import AveragedPerceptronTagger
    from orangewidget.utils.widgetpreview import WidgetPreview

    # Manual preview: tokenize and POS-tag the first three book excerpts,
    # then open them in the viewer with an n-gram range of (1, 2).
    sample = Corpus.from_file('book-excerpts')[:3]
    pos_tagger = AveragedPerceptronTagger()
    preview_corpus = pos_tagger(BASE_TOKENIZER(sample))
    preview_corpus.ngram_range = (1, 2)
    WidgetPreview(OWCorpusViewer).run(preview_corpus)
Ejemplo n.º 44
0
 def setUp(self):
     """Load the 'deerwester' corpus used by the tests."""
     self.corpus = Corpus.from_file('deerwester')
Ejemplo n.º 45
0
    def test_corpus_not_eq(self):
        # Each variant differs from the reference corpus in exactly one aspect.
        reference = Corpus.from_file('book-excerpts')
        row_count = reference.X.shape[0]

        # no text features
        variant = Corpus(reference.domain, reference.X, reference.Y,
                         reference.metas, reference.W, [])
        self.assertNotEqual(reference, variant)

        # different X
        variant = Corpus(reference.domain, np.ones((row_count, 1)), reference.Y,
                         reference.metas, reference.W, reference.text_features)
        self.assertNotEqual(reference, variant)

        # different Y
        variant = Corpus(reference.domain, reference.X, np.ones((row_count, 1)),
                         reference.metas, reference.W, reference.text_features)
        self.assertNotEqual(reference, variant)

        # one meta value blanked
        altered_metas = np.copy(reference.metas)
        altered_metas[0, 0] = ''
        variant = Corpus(reference.domain, reference.X, reference.Y,
                         altered_metas, reference.W, reference.text_features)
        self.assertNotEqual(reference, variant)

        # different meta variable in the domain
        replacement_meta = [StringVariable('text2')]
        altered_domain = Domain(reference.domain.attributes,
                                reference.domain.class_var,
                                replacement_meta)
        variant = Corpus(altered_domain, reference.X, reference.Y,
                         reference.metas, reference.W, replacement_meta)
        self.assertNotEqual(reference, variant)

        # different n-gram range
        variant = reference.copy()
        variant.ngram_range = (2, 4)
        self.assertNotEqual(reference, variant)
Ejemplo n.º 46
0
    def setUp(self):
        """Create the ``OWWordCloud`` widget, load 'deerwester' and build a topic."""
        self.widget = self.create_widget(OWWordCloud)
        self.corpus = Corpus.from_file('deerwester')

        self.topic = self.create_topic()
Ejemplo n.º 47
0
 def test_binary(self):
     # With binary local weighting the largest value in X is 1.
     vectorizer = BowVectorizer(wlocal=BowVectorizer.BINARY)
     documents = Corpus.from_file('deerwester')
     bow = vectorizer.transform(documents)
     largest = bow.X.max()
     self.assertEqual(largest, 1.)
Ejemplo n.º 48
0
    def test_compute_values(self):
        # Re-applying the transformed domain to the raw corpus must reproduce X.
        scored = self.method.transform(self.corpus)
        recomputed = Corpus.from_table(scored.domain, self.corpus)

        self.assertEqual(scored.domain, recomputed.domain)
        cells_match = (scored.X == recomputed.X).all()
        self.assertTrue(cells_match)
Ejemplo n.º 49
0
                                      QtGui.QPalette.HighlightedText))

        textRect = style.subElementRect(QStyle.SE_ItemViewItemText, options)
        painter.save()
        painter.translate(textRect.topLeft())
        painter.setClipRect(textRect.translated(-textRect.topLeft()))
        doc.documentLayout().draw(painter, ctx)

        painter.restore()

    def sizeHint(self, option, index):
        """Return the size needed to lay out the item's HTML text.

        The item text is loaded into a ``QTextDocument`` constrained to the
        width of the item rectangle, and the document's size is reported.
        """
        options = QStyleOptionViewItem(option)
        self.initStyleOption(options, index)

        doc = QtGui.QTextDocument()
        doc.setHtml(options.text)
        doc.setTextWidth(options.rect.width())
        # The document reports floating-point sizes, but QSize takes integers;
        # convert explicitly because Python >= 3.10 no longer does it implicitly.
        return QtCore.QSize(int(doc.idealWidth()), int(doc.size().height()))


if __name__ == '__main__':
    # Manual preview: show the topic modelling widget on the 'deerwester'
    # corpus and save its settings once the event loop exits.
    from AnyQt.QtWidgets import QApplication

    app = QApplication([])
    widget = OWTopicModeling()
    # Alternative input: widget.set_data(Corpus.from_file('book-excerpts'))
    widget.set_data(Corpus.from_file('deerwester'))
    widget.show()
    app.exec()
    widget.saveSettings()
Ejemplo n.º 50
0
    def test_empty_tokens(self):
        # Without text features and with copy=False, the very same corpus
        # object is expected back.
        documents = Corpus.from_file('deerwester')
        documents.text_features = []
        vectorizer = BowVectorizer()
        returned = vectorizer.transform(documents, copy=False)

        self.assertIs(documents, returned)
Ejemplo n.º 51
0
 def setUp(self):
     """Load the 'deerwester' corpus and set up the ``MultiSentiment`` method."""
     self.corpus = Corpus.from_file('deerwester')
     self.method = MultiSentiment()
     # Number of columns the tests expect the method to add.
     self.new_cols = 1
Ejemplo n.º 52
0
 def test_corpus_from_file_with_tab(self):
     # The dataset name with and without the '.tab' extension loads equal corpora.
     without_extension = Corpus.from_file('book-excerpts')
     with_extension = Corpus.from_file('book-excerpts.tab')
     self.assertEqual(without_extension, with_extension)
Ejemplo n.º 53
0
 def setUp(self):
     """Load the 'deerwester' corpus and set up the English ``LiuHuSentiment`` method."""
     self.corpus = Corpus.from_file('deerwester')
     self.method = LiuHuSentiment('English')
     # Number of columns the tests expect the method to add.
     self.new_cols = 1
Ejemplo n.º 54
0
 def test_corpus_from_file_abs_path(self):
     # Loading by name and by absolute path must give equal corpora.
     by_name = Corpus.from_file('book-excerpts')
     here = os.path.dirname(__file__)
     relative = os.path.join(here, '..', 'datasets', 'book-excerpts.tab')
     absolute = os.path.abspath(relative)
     by_path = Corpus.from_file(absolute)
     self.assertEqual(by_name, by_path)
Ejemplo n.º 55
0
 def test_corpus_from_init(self):
     # A corpus rebuilt from its own parts must equal the original.
     loaded = Corpus.from_file('book-excerpts')
     rebuilt = Corpus(loaded.domain, loaded.X, loaded.Y, loaded.metas,
                      loaded.text_features)
     self.assertEqual(loaded, rebuilt)
Ejemplo n.º 56
0
    def create_configuration_layout(self):
        """Build the form holding the hash-size and shingle-length spin boxes."""
        form = QFormLayout()

        size_spin = gui.spin(self, self, "f",
                             minv=8, maxv=SimhashVectorizer.max_f, step=8)
        size_spin.editingFinished.connect(self.f_spin_changed)
        form.addRow('Simhash size:', size_spin)

        shingle_spin = gui.spin(self, self, 'shingle_len', minv=1, maxv=100)
        shingle_spin.editingFinished.connect(self.on_change)
        form.addRow('Shingle length:', shingle_spin)
        return form

    def init_method(self):
        """Instantiate ``self.Method`` with the current ``shingle_len`` and ``f`` settings."""
        return self.Method(shingle_len=self.shingle_len, f=self.f)

    def f_spin_changed(self):
        """Snap ``f`` to a multiple of 8 and then call ``on_change``."""
        # simhash needs f value to be multiple of 8, correct if it is not
        # (round() rounds halves to even, so e.g. 20 becomes 16 and 28 becomes 32)
        self.f = 8 * round(self.f / 8)
        self.on_change()


if __name__ == '__main__':
    # Manual preview: run the Simhash widget on the 'book-excerpts' corpus.
    from orangewidget.utils.widgetpreview import WidgetPreview

    WidgetPreview(OWSimhash).run(Corpus.from_file("book-excerpts"))
Ejemplo n.º 57
0
 def setUp(self):
     """Create the ``OWCorpusViewer`` widget and load the 'deerwester' corpus."""
     self.widget = self.create_widget(OWCorpusViewer)
     self.corpus = Corpus.from_file('deerwester')
Ejemplo n.º 58
0
 def test_corpus_from_file_missing(self):
     # An unknown dataset name must raise FileNotFoundError.
     self.assertRaises(FileNotFoundError, Corpus.from_file, 'missing_file')
Ejemplo n.º 59
0
    def sendData(self):
        """Convert input(s) and send output.

        A connected corpus is converted to a Textable segmentation (one
        segment per non-empty document), and a connected segmentation is
        converted to a Text Mining corpus. Both conversions run when both
        inputs are present; with no input, ``None`` is sent on both outputs.
        """
        if not (self.segmentation or self.corpus):
            self.infoBox.setText(u'Widget needs input.', 'warning')
            self.send('Textable segmentation', None, self)
            self.send('Text Mining corpus', None)
            return

        msg_seg = msg_corpus = ""

        # One progress step per corpus row plus one per segment.
        num_iterations = 0
        if self.corpus:
            num_iterations += len(self.corpus)
        if self.segmentation:
            num_iterations += len(self.segmentation)
        self.infoBox.setText(u"Processing, please wait...", "warning")
        self.controlArea.setDisabled(True)
        progressBar = ProgressBar(self, iterations=num_iterations)

        # Convert corpus to segmentation...
        if self.corpus:
            self.clearCreatedInputs()
            new_segments = list()
            text_feature = self.corpus.text_features[self.segmentContent]
            for row in self.corpus:
                content = row[text_feature].value
                if content == "":
                    # Skipped rows still count towards the progress total.
                    progressBar.advance()
                    continue
                new_input = Input(content)
                new_segment_annotations = dict()
                # "?" is the string form of a missing value; don't annotate it.
                for attr in self.corpus.domain:
                    attr_str = str(row[attr])
                    if attr_str != "?":
                        new_segment_annotations[str(attr)] = attr_str
                for meta_attr in self.corpus.domain.metas:
                    meta_attr_str = str(row[meta_attr])
                    if (meta_attr != text_feature and meta_attr_str != "?"):
                        new_segment_annotations[str(meta_attr)] = meta_attr_str
                new_segments.append(
                    Segment(new_input[0].str_index, new_input[0].start,
                            new_input[0].end, new_segment_annotations))
                self.createdInputs.append(new_input)
                progressBar.advance()
            new_segmentation = Segmentation(new_segments, self.captionTitle)
            msg_seg = u'%i segment@p' % len(new_segmentation)
            msg_seg = pluralize(msg_seg, len(new_segmentation))
            self.send('Textable segmentation', new_segmentation)
        else:
            self.send('Textable segmentation', None)

        # Convert segmentation to corpus...
        if self.segmentation:
            metas = list()
            attributes = list()
            meta_keys = list()
            attribute_keys = list()
            for key in self.segmentation.get_annotation_keys():
                possible_values = set()
                for segment in self.segmentation:
                    try:
                        possible_values.add(str(segment.annotations[key]))
                    except KeyError:
                        pass
                # Keys with too many distinct values become string metas
                # instead of discrete attributes.
                if (self.limitNumCategories
                        and len(possible_values) > self.maxNumCategories):
                    metas.append(StringVariable(key))
                    meta_keys.append(key)
                else:
                    attributes.append(
                        DiscreteVariable(key, values=list(possible_values)))
                    attribute_keys.append(key)
            # The segment text itself is always the last meta column.
            metas.append(StringVariable("textable_text"))
            domain = Domain(attributes, [], metas)
            rows = list()
            for segment in self.segmentation:
                row = [
                    str(segment.annotations.get(annotation_key, None))
                    for annotation_key in attribute_keys
                ]
                row.extend([
                    str(segment.annotations.get(annotation_key, None))
                    for annotation_key in meta_keys
                ])
                row.append(segment.get_content())
                rows.append(row)
                progressBar.advance()
            table = Table(domain, rows)
            # Without the Text Mining add-on no corpus can be built; send None
            # rather than failing on an unbound name.
            corpus = None
            if textMiningIsInstalled:
                corpus = Corpus(domain,
                                X=table.X,
                                metas=table.metas,
                                text_features=[metas[-1]])
                msg_corpus = u'%i document@p' % len(self.segmentation)
                msg_corpus = pluralize(msg_corpus, len(self.segmentation))
            self.send('Text Mining corpus', corpus)
        else:
            self.send('Text Mining corpus', None)

        progressBar.finish()
        self.controlArea.setDisabled(False)

        if msg_seg or msg_corpus:
            message = msg_seg
            if msg_seg and msg_corpus:
                message += " and "
            message += msg_corpus
            message += " sent to output."
            self.infoBox.setText(message)

        self.sendButton.resetSettingsChangedFlag()
Ejemplo n.º 60
0
 def setUp(self):
     """Load 'slo-opinion-corpus' and set up the Slovenian ``LiuHuSentiment`` method."""
     self.corpus = Corpus.from_file('slo-opinion-corpus')
     self.method = LiuHuSentiment('Slovenian')
     # Number of columns the tests expect the method to add.
     self.new_cols = 1