def test_corpus_not_eq(self):
    c = Corpus.from_file('book-excerpts')
    n_doc = c.X.shape[0]

    c2 = Corpus(c.domain, c.X, c.Y, c.metas, c.W, [])
    self.assertNotEqual(c, c2)

    c2 = Corpus(c.domain, np.ones((n_doc, 1)), c.Y, c.metas, c.W,
                c.text_features)
    self.assertNotEqual(c, c2)

    c2 = Corpus(c.domain, c.X, np.ones((n_doc, 1)), c.metas, c.W,
                c.text_features)
    self.assertNotEqual(c, c2)

    broken_metas = np.copy(c.metas)
    broken_metas[0, 0] = ''
    c2 = Corpus(c.domain, c.X, c.Y, broken_metas, c.W, c.text_features)
    self.assertNotEqual(c, c2)

    new_meta = [StringVariable('text2')]
    broken_domain = Domain(c.domain.attributes, c.domain.class_var, new_meta)
    c2 = Corpus(broken_domain, c.X, c.Y, c.metas, c.W, new_meta)
    self.assertNotEqual(c, c2)

    c2 = c.copy()
    c2.ngram_range = (2, 4)
    self.assertNotEqual(c, c2)
def test_asserting_errors(self):
    c = Corpus.from_file('book-excerpts')

    with self.assertRaises(TypeError):
        Corpus(1.0, c.Y, c.metas, c.domain, c.text_features)

    too_large_x = np.vstack((c.X, c.X))
    with self.assertRaises(ValueError):
        Corpus(c.domain, too_large_x, c.Y, c.metas, c.W, c.text_features)

    with self.assertRaises(ValueError):
        c.set_text_features([StringVariable('foobar')])

    with self.assertRaises(ValueError):
        c.set_text_features([c.domain.metas[0], c.domain.metas[0]])
def _corpus_from_records(records, includes_metadata):
    """Receives PubMed records and transforms them into a corpus.

    Args:
        records (list): A list of PubMed entries.
        includes_metadata (list): A list of text fields to include.

    Returns:
        corpus: The output Corpus.
    """
    meta_values, class_values = _records_to_corpus_entries(
        records,
        includes_metadata=includes_metadata
    )

    meta_vars = []
    for field_name, _ in includes_metadata:
        if field_name == 'pub_date':
            meta_vars.append(TimeVariable(field_name))
        else:
            meta_vars.append(StringVariable.make(field_name))

    class_vars = [
        DiscreteVariable('section_name', values=list(set(class_values)))
    ]
    domain = Domain([], class_vars=class_vars, metas=meta_vars)

    Y = np.array([class_vars[0].to_val(cv) for cv in class_values])[:, None]

    return Corpus(domain=domain, Y=Y, metas=meta_values)
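# A minimal, standalone sketch of the construction above (assuming only
# Orange and numpy; the record values here are made up for illustration
# and `_records_to_corpus_entries` is not involved):
import numpy as np
from Orange.data import Domain, DiscreteVariable, StringVariable

class_values = ['History', 'Biology', 'History']
meta_values = np.array([['abstract one'], ['abstract two'],
                        ['abstract three']], dtype=object)

class_var = DiscreteVariable('section_name', values=sorted(set(class_values)))
domain = Domain([], class_vars=[class_var], metas=[StringVariable('abstract')])

# to_val maps each label to its index in `values`; [:, None] turns the
# flat array into the one-column Y that Corpus expects.
Y = np.array([class_var.to_val(cv) for cv in class_values])[:, None]
corpus = Corpus(domain=domain, Y=Y, metas=meta_values)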
def _create_corpus(self):
    corpus = None
    names = ["name", "path", "content"]
    data = []
    category_data = []
    text_categories = list(set(t.category for t in self._text_data))
    values = list(set(text_categories))
    category_var = DiscreteVariable.make("category", values=values)
    for textdata in self._text_data:
        data.append([textdata.name, textdata.path, textdata.content])
        category_data.append(category_var.to_val(textdata.category))
    if len(text_categories) > 1:
        category_data = np.array(category_data)
    else:
        category_var = []
        category_data = np.empty((len(data), 0))
    domain = Domain([], category_var,
                    [StringVariable.make(name) for name in names])
    domain["name"].attributes["title"] = True
    data = np.array(data, dtype=object)
    if len(data):
        corpus = Corpus(domain, Y=category_data, metas=data,
                        text_features=[domain.metas[2]])
    return corpus
def test_corpus_from_init(self):
    c = Corpus.from_file('book-excerpts')
    with self.assertWarns(FutureWarning):
        c2 = Corpus(c.domain, c.X, c.Y, c.metas, c.W, c.text_features)
    np.testing.assert_array_equal(c.X, c2.X)
    np.testing.assert_array_equal(c.metas, c2.metas)
    self.assertEqual(c.documents, c2.documents)
def _generate_corpus(records, required_text_fields):
    """
    Generates a corpus from the input NYT records.

    :param records: The input NYT records.
    :type records: list
    :param required_text_fields: A list of the available NYT text fields.
    :type required_text_fields: list
    :return: :class:`orangecontrib.text.corpus.Corpus`
    """
    metas, class_values = _parse_record_json(records, required_text_fields)

    # Create domain.
    meta_vars = [StringVariable.make(field) for field in required_text_fields]
    meta_vars += [
        StringVariable.make("pub_date"),
        StringVariable.make("country")
    ]
    class_vars = [
        DiscreteVariable("section_name", values=list(set(class_values)))
    ]

    domain = Domain([], class_vars=class_vars, metas=meta_vars)
    Y = np.array([class_vars[0].to_val(cv) for cv in class_values])[:, None]

    return Corpus(None, Y, metas, domain, meta_vars)  # used all features
def main():
    from Orange.data import Table, Domain, ContinuousVariable, StringVariable

    words = 'hey~mr. tallyman tally~me banana daylight come and me wanna go home'
    words = np.array([w.replace('~', ' ') for w in words.split()],
                     dtype=object, ndmin=2).T
    weights = np.random.random((len(words), 1))

    data = np.zeros((len(words), 0))
    metas = []
    for i, w in enumerate(weights.T):
        data = np.column_stack((data, words, w))
        metas = metas + [StringVariable('Topic' + str(i)),
                         ContinuousVariable('weights')]
    domain = Domain([], metas=metas)
    table = Table.from_numpy(domain,
                             X=np.zeros((len(words), 0)),
                             metas=data)
    app = QApplication([''])
    w = OWWordCloud()
    w.on_topic_change(table)

    domain = Domain([], metas=[StringVariable('text')])
    data = Corpus(domain=domain, metas=np.array([[' '.join(words.flat)]]))
    # data = Corpus.from_numpy(domain, X=np.zeros((1, 0)), metas=np.array([[' '.join(words.flat)]]))
    w.on_corpus_change(data)
    w.handleNewSignals()
    w.show()
    app.exec()
def handle_languages(self):
    if self.corpus is not None:
        domain = self.corpus.domain
        if self.detect_languages:
            if self.corpus.languages is None:
                self.corpus.detect_languages()
            curr_attributes = list(domain.attributes)
            curr_class_var = [domain.class_var] if domain.class_var else []
            curr_metas = list(domain.metas)
            curr_variables = curr_attributes + curr_class_var + curr_metas
            curr_names = [var.name for var in curr_variables]
            new_name = get_unique_names(curr_names, "Language")

            variable_attrs = {'language-feature': True}
            new_variable = StringVariable(new_name)
            new_variable.attributes.update(variable_attrs)

            new_domain = Domain(
                attributes=domain.attributes,
                class_vars=domain.class_var,
                metas=list(domain.metas) + [new_variable]
            )
            metas = np.hstack([
                self.corpus.metas,
                np.array(self.corpus.languages).reshape(-1, 1)
            ])
            self.corpus = Corpus(new_domain, self.corpus.X.copy(),
                                 self.corpus.Y.copy(), metas,
                                 self.corpus.W.copy(),
                                 copy(self.corpus.text_features))
        else:
            lang_feat_idx = None
            for i, f in enumerate(domain.metas):
                if ('language-feature' in f.attributes
                        and f.attributes['language-feature']):
                    lang_feat_idx = i
                    break
            if lang_feat_idx is not None:
                new_domain = Domain(
                    attributes=domain.attributes,
                    class_vars=domain.class_var,
                    metas=list(np.delete(list(domain.metas), lang_feat_idx))
                )
                self.corpus = Corpus(
                    new_domain, self.corpus.X.copy(), self.corpus.Y.copy(),
                    np.delete(self.corpus.metas, lang_feat_idx, axis=1),
                    self.corpus.W.copy(), copy(self.corpus.text_features)
                )
    self.Outputs.corpus.send(self.corpus)
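# The column-append step above, isolated from the widget machinery: a
# sketch assuming a plain numpy object array of metas and one detected
# language per row (the values here are hypothetical).
import numpy as np

metas = np.array([['first doc'], ['second doc']], dtype=object)
languages = ['en', 'sl']

# reshape(-1, 1) turns the language list into a column so hstack can
# attach it as a new meta alongside the existing ones
combined = np.hstack([metas, np.array(languages).reshape(-1, 1)])
assert combined.shape == (2, 2)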
def test_init_preserve_shape_of_empty_x(self):
    c = Corpus.from_file('book-excerpts')
    d = c.domain
    new_domain = Domain((ContinuousVariable('c1'),), d.class_vars, d.metas)

    empty_X = csr_matrix((len(c), 1))
    new = Corpus(new_domain, X=empty_X, Y=c.Y, metas=c.metas)
    self.assertEqual(empty_X.nnz, 0)
    self.assertEqual(new.X.shape, empty_X.shape)
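# Why the test asserts on .nnz: building a csr_matrix from a shape tuple
# allocates no stored elements at all, so the corpus must preserve the
# declared shape rather than infer it from (absent) data. A quick
# standalone check:
from scipy.sparse import csr_matrix

m = csr_matrix((140, 1))
assert m.nnz == 0
assert m.shape == (140, 1)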
def get_data(self):
    domain = Domain([], metas=[
        StringVariable("Conc. {}".format(self.word)),
        StringVariable("Document")
    ])
    data = []
    docs = []
    for row in range(self.rowCount()):
        txt = []
        for column in range(self.columnCount()):
            index = self.index(row, column)
            txt.append(str(self.data(index)))
        data.append([" ".join(txt)])
        docs.append([self.corpus.titles[self.word_index[row][0]]])
    conc = np.array(np.hstack((data, docs)), dtype=object)
    return Corpus(domain, metas=conc, text_features=[domain.metas[0]])
def _create_corpus(self) -> Corpus:
    corpus = None
    names = ["name", "path", "content"] if not self.is_conllu else [
        "name", "path", "utterance", "content"
    ]
    data = []
    category_data = []
    text_categories = list(set(t.category for t in self._text_data))
    values = list(set(text_categories))
    category_var = DiscreteVariable.make("category", values=values)
    for textdata in self._text_data:
        datum = [
            # some characters are written as decomposed (č is char c and a
            # separate char for the caron); with NFC normalization we
            # normalize them to be written as precomposed (č is one
            # unicode char - 0x10D)
            # https://docs.python.org/3/library/unicodedata.html#unicodedata.normalize
            normalize('NFC', textdata.name),
            normalize('NFC', textdata.path),
            normalize('NFC', textdata.content)
        ]
        if self.is_conllu:
            datum.insert(2, normalize('NFC', textdata.doc_id))
        data.append(datum)
        category_data.append(category_var.to_val(textdata.category))
    if len(text_categories) > 1:
        category_data = np.array(category_data)
    else:
        category_var = []
        category_data = np.empty((len(data), 0))
    domain = Domain([], category_var,
                    [StringVariable.make(name) for name in names])
    domain["name"].attributes["title"] = True
    data = np.array(data, dtype=object)
    if len(data):
        corpus = Corpus(domain, Y=category_data, metas=data,
                        text_features=[domain.metas[-1]])
    return corpus
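# What the NFC normalization above does, shown on a single character: a
# decomposed "č" (ASCII "c" followed by a combining caron) becomes the
# single precomposed code point U+010D.
from unicodedata import normalize

decomposed = 'c\u030C'  # 'c' + COMBINING CARON
assert len(decomposed) == 2
assert normalize('NFC', decomposed) == '\u010D'  # precomposed 'č'
assert len(normalize('NFC', decomposed)) == 1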
def test_titles(self):
    c = Corpus.from_file('book-excerpts')

    # no title feature set
    titles = c.titles
    self.assertEqual(len(titles), len(c))
    for title in titles:
        self.assertIn('Document ', title)

    # inferred title from heuristics
    expected = list(map(str, range(len(c))))
    c2 = Corpus(Domain([], [], (StringVariable('heading'),)),
                None, None, np.c_[expected])
    titles = c2.titles
    self.assertEqual(titles, expected)

    # title feature set
    c.domain[0].attributes['title'] = True
    titles = c.titles
    self.assertEqual(len(titles), len(c))
    for title in titles:
        self.assertIn(title, c.domain.class_var.values)
def _corpus_from_records(records, includes_metadata):
    """Receives PubMed records and transforms them into a corpus.

    Args:
        records (list): A list of PubMed entries.
        includes_metadata (list): A list of text fields to include.

    Returns:
        corpus: The output Corpus.
    """
    meta_vars = []
    time_var = None
    for field_name, _ in includes_metadata:
        if field_name == PUBMED_FIELD_DATE:
            time_var = TimeVariable(field_name)
            meta_vars.append(time_var)
        else:
            meta_vars.append(StringVariable.make(field_name))
            if field_name == PUBMED_FIELD_TITLE:
                meta_vars[-1].attributes["title"] = True

    meta_values, class_values = _records_to_corpus_entries(
        records,
        includes_metadata=includes_metadata,
        time_var=time_var,
    )

    class_vars = [
        DiscreteVariable(
            'section',
            values=list(map(str, set(filter(None, class_values)))))
    ]
    domain = Domain([], class_vars=class_vars, metas=meta_vars)

    Y = np.array([class_vars[0].to_val(cv) for cv in class_values])[:, None]

    return Corpus(domain=domain, Y=Y, metas=meta_values)
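# How the TimeVariable above ends up carrying dates, in isolation: a
# sketch assuming Orange's TimeVariable.parse, which converts an ISO
# date string into a float timestamp that the variable can render back.
from Orange.data import TimeVariable

tv = TimeVariable('pub_date')
val = tv.parse('2015-06-30')  # float, seconds since the Unix epoch
assert tv.repr_val(val) == '2015-06-30'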
def run_initial_query(self):
    self.warning(1)
    self.error(1)
    # Only execute if the NYT object is present (safety lock).
    # Otherwise this method cannot be called anyway.
    if self.nyt_api:
        # Query keywords.
        qkw = self.query_combo.currentText()

        # Text fields.
        text_includes_params = [
            self.includes_headline, self.includes_lead_paragraph,
            self.includes_snippet, self.includes_abstract,
            self.includes_keywords
        ]
        if True not in text_includes_params:
            self.warning(1, "You must select at least one text field.")
            return

        # Set the query url.
        self.nyt_api.set_query_url(qkw, self.year_from, self.year_to,
                                   text_includes_params)

        # Execute the query.
        res, cached, error = self.nyt_api.execute_query(0)

        if res:
            # Construct a corpus for the output.
            documents, metas, meta_vars, class_values = parse_record_json(
                res, text_includes_params)
            class_vars = [
                DiscreteVariable("section_name",
                                 values=list(set(class_values)))
            ]
            Y = np.array([class_vars[0].to_val(cv)
                          for cv in class_values])[:, None]
            Y[np.isnan(Y)] = 0
            domain = Domain([], class_vars=class_vars, metas=meta_vars)
            self.output_corpus = Corpus(documents, None, Y, metas, domain)
            self.send(Output.CORPUS, self.output_corpus)

            # Update the response info.
            self.all_hits = res["response"]["meta"]["hits"]
            self.num_retrieved = len(res["response"]["docs"])
            info_label = "Records: {}\nRetrieved: {}".format(
                self.all_hits, self.num_retrieved)
            if self.all_hits > 1000:
                info_label += " (max 1000)"
            self.query_info_label.setText(info_label)

            # Enable 'retrieve remaining' button.
            if self.num_retrieved < min(self.all_hits, 1000):
                self.retrieve_other_button.setText(
                    'Retrieve remaining records ({})'.format(
                        min(self.all_hits, 1000) - self.num_retrieved))
                self.retrieve_other_button.setEnabled(True)
                self.retrieve_other_button.setFocus()
            else:
                self.retrieve_other_button.setText('All records retrieved')
                self.retrieve_other_button.setEnabled(False)

            # Add the query to history.
            if qkw not in self.recent_queries:
                self.recent_queries.insert(0, qkw)
        elif error:
            if isinstance(error, HTTPError):
                self.error(1, "An error occurred (HTTP {})".format(error.code))
            elif isinstance(error, URLError):
                self.error(1, "An error occurred (URL {})".format(error.reason))
def test_corpus_from_init(self):
    c = Corpus.from_file('book-excerpts')
    c2 = Corpus(c.domain, c.X, c.Y, c.metas, c.text_features)
    self.assertEqual(c, c2)
class BowVectorizationTest(unittest.TestCase):
    def test_transform(self):
        vect = BowVectorizer()
        corpus = Corpus.from_file('deerwester')

        result = vect.transform(corpus)
        self.assertIsInstance(result, Corpus)
        self.assertEqual(len(result.domain.variables), 43)

    def test_binary(self):
        vect = BowVectorizer(wlocal=BowVectorizer.BINARY)
        corpus = Corpus.from_file('deerwester')
        result = vect.transform(corpus)
        self.assertEqual(result.X.max(), 1.)

    def test_empty_tokens(self):
        corpus = Corpus.from_file('deerwester')
        corpus.text_features = []
        bag_of_words = BowVectorizer().transform(corpus, copy=False)
        self.assertIs(corpus, bag_of_words)

    def test_domain(self):
        vect = BowVectorizer()
        corpus = Corpus.from_file('deerwester')

        result = vect.transform(corpus)
        attrs = [attr.name for attr in result.domain.attributes]
        self.assertEqual(attrs, sorted(attrs))

        X = result.X.toarray()
        for i in range(len(corpus)):
            for contains, attr in zip(X[i], attrs):
                if contains > .001:
                    self.assertIn(attr, corpus.tokens[i])

    def test_ngrams(self):
        vect = BowVectorizer()
        corpus = Corpus.from_file('deerwester')
        corpus = preprocess.RegexpTokenizer(r'\w+')(corpus)
        corpus = preprocess.NGrams(ngrams_range=(1, 3))(corpus)
        result = vect.transform(corpus)
        attrs = [attr.name for attr in result.domain.attributes]
        self.assertIn(corpus.tokens[0][1], attrs)
        self.assertIn(' '.join(corpus.tokens[0][:2]), attrs)
        self.assertIn(' '.join(corpus.tokens[0][:3]), attrs)

    def test_report(self):
        vect = BowVectorizer()
        self.assertGreater(len(vect.report()), 0)

    def test_args(self):
        corpus = Corpus.from_file('deerwester')

        BowVectorizer.wglobals['const'] = lambda df, N: 1

        vect = BowVectorizer(norm=BowVectorizer.NONE,
                             wlocal=BowVectorizer.COUNT,
                             wglobal='const')
        self.assertEqualCorpus(
            vect.transform(corpus),
            BowVectorizer(wlocal=BowVectorizer.COUNT).transform(corpus))

        vect = BowVectorizer(norm=BowVectorizer.NONE,
                             wlocal=BowVectorizer.BINARY,
                             wglobal='const')
        self.assertEqualCorpus(
            vect.transform(corpus),
            BowVectorizer(wlocal=BowVectorizer.BINARY).transform(corpus))

        vect = BowVectorizer(norm=BowVectorizer.L1,
                             wlocal=BowVectorizer.COUNT,
                             wglobal='const')
        x = vect.transform(corpus).X
        self.assertAlmostEqual(abs(x.sum(axis=1) - 1).sum(), 0)

    def test_compute_values(self):
        corpus = Corpus.from_file('deerwester')
        vect = BowVectorizer()

        bow = vect.transform(corpus)
        computed = Corpus.from_table(bow.domain, corpus)

        self.assertEqual(bow.domain, computed.domain)
        self.assertEqual((bow.X != computed.X).nnz, 0)

    def test_compute_values_to_different_domain(self):
        source = Corpus.from_file('deerwester')
        destination = Corpus.from_file('book-excerpts')

        self.assertFalse(source.domain.attributes)
        self.assertFalse(destination.domain.attributes)

        bow = BowVectorizer().transform(source)
        computed = destination.transform(bow.domain)

        self.assertEqual(bow.domain.attributes, computed.domain.attributes)

    def assertEqualCorpus(self, first, second, msg=None):
        np.testing.assert_allclose(first.X.todense(),
                                   second.X.todense(),
                                   err_msg=msg)

    def test_empty_corpus(self):
        """
        Empty data.
        GH-247
        """
        corpus = Corpus.from_file("deerwester")[:0]
        vect = BowVectorizer(norm=BowVectorizer.L1)
        out = vect.transform(corpus)

        self.assertEqual(out, corpus)

    def tests_duplicated_names(self):
        """
        BOW adds words to the domain; if the same attribute name already
        appears in the domain, the existing attribute is renamed by
        appending a number to its name.
        """
        corpus = Corpus.from_file("deerwester")
        corpus = corpus.extend_attributes(np.ones((len(corpus), 1)), ["human"])
        corpus = corpus.extend_attributes(np.ones((len(corpus), 1)),
                                          ["testtest"])
        vect = BowVectorizer()
        out = vect.transform(corpus)
        # the first attribute was in the dataset before bow and should be
        # renamed
        self.assertEqual("human (1)", out.domain[0].name)
        self.assertEqual("testtest", out.domain[1].name)
        # all attributes from [1:] are bow attributes and should include
        # "human"
        self.assertIn("human", [v.name for v in out.domain.attributes[1:]])

    def test_compute_values_same_tfidf_regardless_num_documents(self):
        """
        When computing TF-IDF from compute values, TF-IDF should give the
        same results regardless of the length of the new corpus - IDF
        weighting should consider only counts from the original corpus.
        """
        corpus = Corpus.from_file('deerwester')
        train_corpus = corpus[:5]
        test_corpus = corpus[5:]
        vect = BowVectorizer(wglobal=BowVectorizer.IDF)

        bow = vect.transform(train_corpus)
        computed1 = Corpus.from_table(bow.domain, test_corpus[1:])
        computed2 = Corpus.from_table(bow.domain, test_corpus)

        self.assertEqual(computed1.domain, computed2.domain)
        self.assertEqual(bow.domain, computed2.domain)
        self.assertEqual((computed1.X != computed2.X[1:]).nnz, 0)

    # fmt: off
    domain = Domain([], metas=[StringVariable("text")])
    small_corpus_train = Corpus(
        domain,
        np.empty((4, 0)),
        metas=np.array([["this is a nice day day"],
                        ["the day is nice"],
                        ["i love a beautiful day"],
                        ["this apple is mine"]]))
    terms = [
        "this", "is", "a", "nice", "day", "the", "i", "love", "beautiful",
        "apple", "mine"
    ]
    train_counts = np.array([[1, 1, 1, 1, 2, 0, 0, 0, 0, 0, 0],
                             [0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0],
                             [0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0],
                             [1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1]])
    small_corpus_test = Corpus(
        domain,
        np.empty((3, 0)),
        metas=np.array([["this is a nice day day"],
                        ["day nice summer mine"],
                        ["apple is cool"]]))
    test_counts = np.array([[1, 1, 1, 1, 2, 0, 0, 0, 0, 0, 0],
                            [0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1],
                            [0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0]])
    # fmt: on

    def assert_bow_same(self, corpus, values, terms):
        self.assertSetEqual(set(terms),
                            set(a.name for a in corpus.domain.attributes))
        for i, a in enumerate(terms):
            self.assertListEqual(
                corpus.get_column_view(a)[0].tolist(),
                values[:, i].tolist(),
                f"BOW differ for term {a}",
            )

    def test_count_correctness(self):
        """Test if computed counts are correct for train and test dataset"""
        bow = BowVectorizer().transform(self.small_corpus_train)
        self.assert_bow_same(bow, self.train_counts, self.terms)

        # computed from compute_values - result contains only terms from
        # the train dataset
        bow_test = Corpus.from_table(bow.domain, self.small_corpus_test)
        self.assert_bow_same(bow_test, self.test_counts, self.terms)

    def test_tfidf_correctness(self):
        """
        Test if computed tf-idfs are correct for the train and test dataset.

        When computing tf-idf on the test dataset (via compute values),
        weights (idf) must be computed based on counts from the training
        dataset.
        """
        bow = BowVectorizer(wglobal=BowVectorizer.IDF).transform(
            self.small_corpus_train)

        document_appearance = (self.train_counts != 0).sum(0)
        n = len(self.train_counts)
        idfs_train = self.train_counts * np.log(n / document_appearance)
        self.assert_bow_same(bow, idfs_train, self.terms)

        bow_test = Corpus.from_table(bow.domain, self.small_corpus_test)
        # weights computed based on counts from the training dataset
        idfs_test = self.test_counts * np.log(n / document_appearance)
        self.assert_bow_same(bow_test, idfs_test, self.terms)
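# The idf weighting that both correctness tests rely on, written out for
# a single term: a sketch assuming natural log and no smoothing,
# mirroring np.log(n / document_appearance) above.
import numpy as np

day_counts = np.array([2, 1, 1, 0])  # "day" counts per training document
df = (day_counts != 0).sum()         # appears in 3 of the 4 documents
idf = np.log(len(day_counts) / df)   # log(4 / 3), about 0.2877
tfidf = day_counts * idf             # first document: 2 * log(4 / 3)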
# Runs in Orange's Python Script widget, where `in_data` holds the input
# corpus and `out_data` is sent to the output.
from Orange.data import Domain
from orangecontrib.text.corpus import Corpus

CUT_VALUE = 10

"""
Remove columns that do not have any value above CUT_VALUE
"""
print("Num values in original data:", len(in_data.X.data))
print("Num attributes in original data:", len(in_data.domain.attributes))

column_max = in_data.X.max(axis=0).toarray().flatten()
attributes_mask = column_max > CUT_VALUE
out_data = Corpus(
    Domain(
        [a for a, inc in zip(in_data.domain.attributes, attributes_mask)
         if inc],
        in_data.domain.class_var,
        in_data.domain.metas),
    in_data.X[:, attributes_mask],
    Y=in_data.Y,
    metas=in_data.metas,
    text_features=in_data.text_features)

print("Num values after removing columns:", len(out_data.X.data))

"""
This part is optional: Remove values that are not above CUT_VALUE
"""
cx = out_data.X.tocoo()
for i, j, v in zip(cx.row, cx.col, cx.data):
    if v <= CUT_VALUE:
        out_data.X[i, j] = 0
out_data.X.eliminate_zeros()
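# An equivalent, vectorized way to do the optional second step, assuming
# out_data.X stays a scipy CSR matrix: masking .data directly avoids the
# per-entry assignment above, which is slow on CSR matrices.
out_data.X.data[out_data.X.data <= CUT_VALUE] = 0
out_data.X.eliminate_zeros()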
def sendData(self):
    """Convert input(s) and send output"""
    if not (self.segmentation or self.corpus):
        self.infoBox.setText(u'Widget needs input.', 'warning')
        self.send('Textable segmentation', None, self)
        self.send('Text Mining corpus', None)
        return

    msg_seg = msg_corpus = ""
    num_iterations = 0
    if self.corpus:
        num_iterations += len(self.corpus)
    if self.segmentation:
        num_iterations += len(self.segmentation)
    self.infoBox.setText(u"Processing, please wait...", "warning")
    self.controlArea.setDisabled(True)
    progressBar = ProgressBar(self, iterations=num_iterations)

    # Convert corpus to segmentation...
    if self.corpus:
        self.clearCreatedInputs()
        new_segments = list()
        text_feature = self.corpus.text_features[self.segmentContent]
        for row in self.corpus:
            content = row[text_feature].value
            if content == "":
                continue
            new_input = Input(row[text_feature].value)
            new_segment_annotations = dict()
            for attr in self.corpus.domain:
                attr_str = str(row[attr])
                if attr_str != "?":
                    new_segment_annotations[str(attr)] = attr_str
            for meta_attr in self.corpus.domain.metas:
                meta_attr_str = str(row[meta_attr])
                if meta_attr != text_feature and meta_attr_str != "?":
                    new_segment_annotations[str(meta_attr)] = meta_attr_str
            new_segments.append(Segment(new_input[0].str_index,
                                        new_input[0].start,
                                        new_input[0].end,
                                        new_segment_annotations))
            self.createdInputs.append(new_input)
            progressBar.advance()
        new_segmentation = Segmentation(new_segments, self.captionTitle)
        msg_seg = u'%i segment@p' % len(new_segmentation)
        msg_seg = pluralize(msg_seg, len(new_segmentation))
        self.send('Textable segmentation', new_segmentation, self)
    else:
        self.send('Textable segmentation', None, self)

    # Convert segmentation to corpus...
    if self.segmentation:
        metas = list()
        attributes = list()
        meta_keys = list()
        attribute_keys = list()
        for key in self.segmentation.get_annotation_keys():
            possible_values = set()
            for segment in self.segmentation:
                try:
                    possible_values.add(str(segment.annotations[key]))
                except KeyError:
                    pass
            if (self.limitNumCategories
                    and len(possible_values) > self.maxNumCategories):
                metas.append(StringVariable(key))
                meta_keys.append(key)
            else:
                attributes.append(
                    DiscreteVariable(key, values=list(possible_values)))
                attribute_keys.append(key)
        metas.append(StringVariable("textable_text"))
        domain = Domain(attributes, [], metas)
        rows = list()
        for segment in self.segmentation:
            row = [
                str(segment.annotations.get(annotation_key, None))
                for annotation_key in attribute_keys
            ]
            row.extend([
                str(segment.annotations.get(annotation_key, None))
                for annotation_key in meta_keys
            ])
            row.append(segment.get_content())
            rows.append(row)
            progressBar.advance()
        table = Table(domain, rows)
        if textMiningIsInstalled:
            corpus = Corpus(domain, X=table.X, metas=table.metas,
                            text_features=[metas[-1]])
            msg_corpus = u'%i document@p' % len(self.segmentation)
            msg_corpus = pluralize(msg_corpus, len(self.segmentation))
            self.send('Text Mining corpus', corpus)
    else:
        self.send('Text Mining corpus', None)

    progressBar.finish()
    self.controlArea.setDisabled(False)

    if msg_seg or msg_corpus:
        message = msg_seg
        if msg_seg and msg_corpus:
            message += " and "
        message += msg_corpus
        message += " sent to output."
        self.infoBox.setText(message)

    self.sendButton.resetSettingsChangedFlag()