Example #1
    def test_colors(self):
        var = DiscreteVariable.make("a", values=["F", "M"])
        self.assertIsNone(var._colors)
        self.assertEqual(var.colors.shape, (2, 3))
        self.assertIs(var._colors, var.colors)
        self.assertEqual(var.colors.shape, (2, 3))
        self.assertFalse(var.colors.flags.writeable)

        var.colors = np.arange(6).reshape((2, 3))
        np.testing.assert_almost_equal(var.colors, [[0, 1, 2], [3, 4, 5]])
        self.assertFalse(var.colors.flags.writeable)
        with self.assertRaises(ValueError):
            var.colors[0] = [42, 41, 40]
        var.set_color(0, [42, 41, 40])
        np.testing.assert_almost_equal(var.colors, [[42, 41, 40], [3, 4, 5]])

        var = DiscreteVariable.make("x", values=["A", "B"])
        var.attributes["colors"] = ['#0a0b0c', '#0d0e0f']
        np.testing.assert_almost_equal(var.colors, [[10, 11, 12], [13, 14, 15]])

        # Test ncolors adapts to nvalues
        var = DiscreteVariable.make('foo', values=['d', 'r'])
        self.assertEqual(len(var.colors), 2)
        var.add_value('e')
        self.assertEqual(len(var.colors), 3)
        user_defined = (0, 0, 0)
        var.set_color(2, user_defined)
        var.add_value('k')
        self.assertEqual(len(var.colors), 4)
        np.testing.assert_array_equal(var.colors[2], user_defined)
Example #2
    def test_repr(self):
        var = DiscreteVariable.make("a", values=["F", "M"])
        self.assertEqual(
            repr(var),
            "DiscreteVariable(name='a', values=['F', 'M'])")
        var.ordered = True
        self.assertEqual(
            repr(var),
            "DiscreteVariable(name='a', values=['F', 'M'], ordered=True)")

        var = DiscreteVariable.make("a", values="1234567")
        self.assertEqual(
            repr(var),
            "DiscreteVariable(name='a', values=['1', '2', '3', '4', '5', '6', '7'])")
Example #3
    def _create_corpus(self):
        corpus = None
        names = ["name", "path", "content"]
        data = []
        category_data = []
        text_categories = list(set(t.category for t in self._text_data))
        values = list(set(text_categories))
        category_var = DiscreteVariable.make("category", values=values)
        for textdata in self._text_data:
            data.append(
                [textdata.name,
                 textdata.path,
                 textdata.content]
            )
            category_data.append(category_var.to_val(textdata.category))
        if len(text_categories) > 1:
            category_data = np.array(category_data)
        else:
            category_var = []
            category_data = np.empty((len(data), 0))
        domain = Domain(
            [], category_var, [StringVariable.make(name) for name in names]
        )
        domain["name"].attributes["title"] = True
        data = np.array(data, dtype=object)
        if len(data):
            corpus = Corpus(domain,
                            Y=category_data,
                            metas=data,
                            text_features=[domain.metas[2]])

        return corpus
Example #4
    def test_repr(self):
        var = DiscreteVariable.make("a", values=["F", "M"])
        self.assertEqual(
            repr(var),
            "DiscreteVariable('a', values=['F', 'M'])")
        var.base_value = 1
        self.assertEqual(
            repr(var),
            "DiscreteVariable('a', values=['F', 'M'], base_value=1)")
        var.ordered = True
        self.assertEqual(
            repr(var),
            "DiscreteVariable('a', values=['F', 'M'], "
            "ordered=True, base_value=1)")

        var = DiscreteVariable.make("a", values="1234567")
        self.assertEqual(
            repr(var),
            "DiscreteVariable('a', values=['1', '2', '3', '4', '5', ...])")
Example #5
    def test_colors(self):
        var = DiscreteVariable.make("a", values=["F", "M"])
        self.assertIsNone(var._colors)
        self.assertEqual(var.colors.shape, (2, 3))
        self.assertIs(var._colors, var.colors)
        self.assertEqual(var.colors.shape, (2, 3))
        self.assertFalse(var.colors.flags.writeable)

        var.colors = np.arange(6).reshape((2, 3))
        np.testing.assert_almost_equal(var.colors, [[0, 1, 2], [3, 4, 5]])
        self.assertFalse(var.colors.flags.writeable)
        with self.assertRaises(ValueError):
            var.colors[0] = [42, 41, 40]
        var.set_color(0, [42, 41, 40])
        np.testing.assert_almost_equal(var.colors, [[42, 41, 40], [3, 4, 5]])

        var = DiscreteVariable.make("x", values=["A", "B"])
        var.attributes["colors"] = ['#0a0b0c', '#0d0e0f']
        np.testing.assert_almost_equal(var.colors, [[10, 11, 12], [13, 14, 15]])
Example #6
 def test_val_from_str_add(self):
     var = DiscreteVariable.make("a", values=("F", "M"))
     self.assertTrue(math.isnan(var.val_from_str_add(None)))
     self.assertEqual(var.val_from_str_add("M"), 1)
     self.assertEqual(var.val_from_str_add("F"), 0)
     self.assertEqual(var.values, ("F", "M"))
     self.assertEqual(var.val_from_str_add("N"), 2)
     self.assertEqual(var.values, ("F", "M", "N"))
     self.assertEqual(var._value_index, {"F": 0, "M": 1, "N": 2})
     self.assertEqual(var.val_from_str_add("M"), 1)
     self.assertEqual(var.val_from_str_add("F"), 0)
     self.assertEqual(var.val_from_str_add("N"), 2)
Example #7
    def _guess_variable(self, field_name, field_metadata, inspect_table):
        type_code = field_metadata[0]

        FLOATISH_TYPES = (700, 701, 1700)  # real, float8, numeric
        INT_TYPES = (20, 21, 23)  # bigint, int, smallint
        CHAR_TYPES = (25, 1042, 1043,)  # text, char, varchar
        BOOLEAN_TYPES = (16,)  # bool
        DATE_TYPES = (1082, 1114, 1184, )  # date, timestamp, timestamptz
        # time, timestamp, timestamptz, timetz
        TIME_TYPES = (1083, 1114, 1184, 1266,)

        if type_code in FLOATISH_TYPES:
            return ContinuousVariable.make(field_name)

        if type_code in TIME_TYPES + DATE_TYPES:
            tv = TimeVariable.make(field_name)
            tv.have_date |= type_code in DATE_TYPES
            tv.have_time |= type_code in TIME_TYPES
            return tv

        if type_code in INT_TYPES:  # bigint, int, smallint
            if inspect_table:
                values = self.get_distinct_values(field_name, inspect_table)
                if values:
                    return DiscreteVariable.make(field_name, values)
            return ContinuousVariable.make(field_name)

        if type_code in BOOLEAN_TYPES:
            return DiscreteVariable.make(field_name, ['false', 'true'])

        if type_code in CHAR_TYPES:
            if inspect_table:
                values = self.get_distinct_values(field_name, inspect_table)
                # remove trailing spaces
                values = [v.rstrip() for v in values]
                if values:
                    return DiscreteVariable.make(field_name, values)

        return StringVariable.make(field_name)
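The least obvious branch above is the date/time one: type code 1114 (timestamp) is deliberately listed in both DATE_TYPES and TIME_TYPES, so the resulting TimeVariable carries both flags. A minimal sketch of just that branch, assuming Orange is installed; the column name is made up.

from Orange.data import TimeVariable

DATE_TYPES = (1082, 1114, 1184)         # date, timestamp, timestamptz
TIME_TYPES = (1083, 1114, 1184, 1266)   # time, timestamp, timestamptz, timetz

type_code = 1114                        # PostgreSQL "timestamp"
tv = TimeVariable.make("created_at")    # hypothetical column name
tv.have_date |= type_code in DATE_TYPES
tv.have_time |= type_code in TIME_TYPES
print(tv.have_date, tv.have_time)       # both flags set (1 1 or True True, depending on version)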
Example #9
    def create_contingency_table(self):
        """
        Create Orange.table from results

        Return
        --------
        o_model : Orange.Table
        """
        # create Orange.Table for calculated model
        dmn = [self.columns[self.genes[i]].name for i in self.column_order_]
        mts = DiscreteVariable.make(self.class_var.name, values=self.clusters_names)
        self.o_model = self.contingency_table(self.model, mts, dmn, np.array([self.row_order_]).T)
        return self.o_model
Example #10
    def test_colors(self):
        var = DiscreteVariable.make("a", values=("F", "M"))
        self.assertIsNone(var._colors)
        self.assertEqual(var.colors.shape, (2, 3))
        self.assertFalse(var.colors.flags.writeable)

        var.colors = np.arange(6).reshape((2, 3))
        np.testing.assert_almost_equal(var.colors, [[0, 1, 2], [3, 4, 5]])
        self.assertEqual(var.attributes["colors"],
                         {"F": "#000102", "M": "#030405"})
        self.assertFalse(var.colors.flags.writeable)
        with self.assertRaises(ValueError):
            var.colors[0] = [42, 41, 40]

        var = DiscreteVariable.make("x", values=("A", "B"))
        var.attributes["colors"] = {"A": "#0a0b0c", "B": "#0d0e0f"}
        np.testing.assert_almost_equal(var.colors, [[10, 11, 12], [13, 14, 15]])

        # Backward compatibility with list-like attributes
        var = DiscreteVariable.make("x", values=("A", "B"))
        var.attributes["colors"] = ["#0a0b0c", "#0d0e0f"]
        np.testing.assert_almost_equal(var.colors, [[10, 11, 12], [13, 14, 15]])

        # Test ncolors adapts to nvalues
        var = DiscreteVariable.make('foo', values=('d', 'r'))
        self.assertEqual(len(var.colors), 2)
        var.add_value('e')
        self.assertEqual(len(var.colors), 3)
        var.add_value('k')
        self.assertEqual(len(var.colors), 4)

        # Missing colors are retrieved from palette
        var = DiscreteVariable.make("x", values=("A", "B", "C"))
        palette = LimitedDiscretePalette(3).palette
        var.attributes["colors"] = {"C": color_to_hex(palette[0]),
                                    "B": "#0D0E0F"}
        np.testing.assert_almost_equal(var.colors,
                                       [palette[1], [13, 14, 15], palette[0]])
Example #11
    def __create_temp_class_var(self):
        """ See no evil !"""
        cluster_indicator_name = 'Cluster indicators'
        row_profile = None
        new_cluster_values = []
        var_index_lookup = {
            val: idx
            for var in self.cluster_indicators
            for idx, val in enumerate(var.values)
        }

        cart_prod = itertools.product(
            *[cluster.values for cluster in self.cluster_indicators])
        for comb in cart_prod:
            new_cluster_values.append(', '.join([val for val in comb]))
            self.new_cluster_profile.append(
                [var_index_lookup[val] for val in comb])

        row_profile_lookup = {
            tuple(profile): indx
            for indx, (profile, _) in enumerate(
                zip(self.new_cluster_profile, new_cluster_values))
        }
        for var in self.cluster_indicators:
            if row_profile is None:
                row_profile = np.asarray(
                    self.input_data.get_column_view(var)[0], dtype=int)
            else:
                row_profile = np.vstack(
                    (row_profile,
                     np.asarray(self.input_data.get_column_view(var)[0],
                                dtype=int)))

        ca_ind = DiscreteVariable.make(
            cluster_indicator_name,
            values=[val for val in new_cluster_values],
            ordered=True)

        domain = Domain(
            self.input_data.domain.attributes,
            self.input_data.domain.class_vars,
            self.input_data.domain.metas + (ca_ind, ),
        )

        table = self.input_data.transform(domain)
        table[:, ca_ind] = np.array(
            [[row_profile_lookup[tuple(row_profile[:, i])]]
             for i in range(row_profile.shape[1])])
        self.input_data = table
        return ca_ind
Example #12
    def test_find_compatible_ordered(self):
        abc = DiscreteVariable("abc", values="abc", ordered=True)

        find_comp = DiscreteVariable._find_compatible

        self.assertIsNone(find_comp("abc"))
        self.assertIsNone(find_comp("abc", list("abc")))
        self.assertIs(find_comp("abc", ordered=True), abc)
        self.assertIs(find_comp("abc", ["a"], ordered=True), abc)
        self.assertIs(find_comp("abc", ["a", "b"], ordered=True), abc)
        self.assertIs(find_comp("abc", ["a", "b", "c"], ordered=True), abc)
        self.assertIs(find_comp("abc", ["a", "b", "c", "d"], ordered=True), abc)

        abd = DiscreteVariable.make(
            "abc", values=["a", "d", "b"], ordered=True)
        self.assertIsNot(abc, abd)

        abc_un = DiscreteVariable.make("abc", values=["a", "b", "c"])
        self.assertIsNot(abc_un, abc)

        self.assertIs(
            find_comp("abc", values=["a", "d", "b"], ordered=True), abd)
        self.assertIs(find_comp("abc", values=["a", "b", "c"]), abc_un)
Example #14
    def test_colors(self):
        var = DiscreteVariable.make("a", values=["F", "M"])
        self.assertIsNone(var._colors)
        self.assertEqual(var.colors.shape, (2, 3))
        self.assertFalse(var.colors.flags.writeable)

        var.colors = np.arange(6).reshape((2, 3))
        np.testing.assert_almost_equal(var.colors, [[0, 1, 2], [3, 4, 5]])
        self.assertFalse(var.colors.flags.writeable)
        with self.assertRaises(ValueError):
            var.colors[0] = [42, 41, 40]

        var = DiscreteVariable.make("x", values=["A", "B"])
        var.attributes["colors"] = ['#0a0b0c', '#0d0e0f']
        np.testing.assert_almost_equal(var.colors,
                                       [[10, 11, 12], [13, 14, 15]])

        # Test ncolors adapts to nvalues
        var = DiscreteVariable.make('foo', values=['d', 'r'])
        self.assertEqual(len(var.colors), 2)
        var.add_value('e')
        self.assertEqual(len(var.colors), 3)
        var.add_value('k')
        self.assertEqual(len(var.colors), 4)
Example #15
 def test_unpickle(self):
     d1 = DiscreteVariable("A", values=["two", "one"])
     s = pickle.dumps(d1)
     d2 = DiscreteVariable.make("A", values=["one", "two", "three"])
     d2_values = tuple(d2.values)
     d1c = pickle.loads(s)
     # See: gh-3238
     # The unpickle reconstruction picks an existing variable (d2), on which
     # __setstate__ or __dict__.update is called
     self.assertSequenceEqual(d2.values, d2_values)
     self.assertSequenceEqual(d1c.values, d1.values)
     s = pickle.dumps(d2)
     DiscreteVariable._clear_all_caches()  # [comment redacted]
     d1 = DiscreteVariable("A", values=["one", "two"])
     d2 = pickle.loads(s)
     self.assertSequenceEqual(d2.values, ["two", "one", "three"])
Example #16
 def test_unpickle(self):
     d1 = DiscreteVariable("A", values=("two", "one"))
     s = pickle.dumps(d1)
     d2 = DiscreteVariable.make("A", values=("one", "two", "three"))
     d2_values = tuple(d2.values)
     d1c = pickle.loads(s)
     # See: gh-3238
     # The unpickle reconstruction picks an existing variable (d2), on which
     # __setstate__ or __dict__.update is called
     self.assertSequenceEqual(d2.values, d2_values)
     self.assertSequenceEqual(d1c.values, d1.values)
     s = pickle.dumps(d2)
     d1 = DiscreteVariable("A", values=("one", "two"))
     d2 = pickle.loads(s)
     self.assertSequenceEqual(d2.values, ("one", "two", "three"))
     self.assertSequenceEqual(d1.values, ("one", "two"))
Example #18
    def __init__(self):
        # check if db3x3.pickle available
        pickle_path = os.path.join(os.path.dirname(__file__), "db3x3.pickle")
        if not os.path.isfile(pickle_path):
            dtg = {}
            txt_path = os.path.join(os.path.dirname(__file__), "db3x3.txt")
            for l in open(txt_path):
                desc, nmoves = l.strip().split(",")
                desc = bytes(desc, encoding="utf8")
                dtg[desc] = float(nmoves)
                if dtg[desc] < 0:
                    dtg[desc] = -dtg[desc]
            pickle.dump(dtg, open(pickle_path, "wb"))
        self.dtg = pickle.load(open(pickle_path, "rb"))

        self.attributes = [DiscreteVariable.make("{}_{}".format(i, v),
                                                 values=["no", "yes"])
                           for i in range(9) for v in range(9)]
Example #19
    def generate_dataset(self, n=3, cols=10, samples=1000):
        cluster_std = 2.5
        f = lambda x, i: .5 * x * i + 2 * i

        X, Y = datasets.make_blobs(n_samples=samples,
                                   n_features=cols,
                                   centers=1,
                                   cluster_std=cluster_std,
                                   random_state=10)
        for i in range(samples):
            y = i % n
            Y[i] = y
            if y > 0:
                X[i, cols // 2:] = f(X[i, cols // 2:], i)
        dom = Domain([ContinuousVariable.make(str(i)) for i in range(cols)],
                     DiscreteVariable.make('class',
                                           values=[str(i) for i in range(n)]))
        return Table.from_numpy(dom, X, Y[np.newaxis].T)
Example #20
    def make_orange_table(self, dataset):
        """
        Make Orange table with raw audio data and send it to output

        :param dataset: input dataset
        :return: Void
        """

        X = []
        for i in dataset.metas:
            import os
            if os.path.isfile(i[1]):
                filename = i[1]
            else:
                filename = i[1].split(".wav")[0]

            framerate, data = read(filename)
            if len(data.shape) > 1:
                data = data[:, 0]
            X.append(data)

        X = self.make_square_array(numpy.array(X), max(dataset.metas[:, 2]))
        data_vars = [
            Orange.data.ContinuousVariable.make('n{:d}'.format(i))
            for i in range(len(X[0]))
        ]

        if dataset.Y.size:
            Y = DiscreteVariable.make("Category",
                                      values=dataset.domain.class_var.values,
                                      ordered=True)
        else:
            Y = None

        self.domain = Domain(attributes=data_vars,
                             class_vars=Y,
                             metas=dataset.domain.metas)

        orange_table = Table(self.domain, numpy.array(X), dataset.Y,
                             dataset.metas)
        self.send("Audio", orange_table)
Example #21
 def _create_corpus(self) -> Corpus:
     corpus = None
     names = ["name", "path", "content"] if not self.is_conllu else [
         "name", "path", "utterance", "content"
     ]
     data = []
     category_data = []
     text_categories = list(set(t.category for t in self._text_data))
     values = list(set(text_categories))
     category_var = DiscreteVariable.make("category", values=values)
     for textdata in self._text_data:
         datum = [
             # some characters are written as decomposed (č is char c
             # and separate char for caron), with NFC normalization we
             # normalize them to be written as precomposed (č is one
             # unicode char - 0x10D)
             # https://docs.python.org/3/library/unicodedata.html#unicodedata.normalize
             normalize('NFC', textdata.name),
             normalize('NFC', textdata.path),
             normalize('NFC', textdata.content)
         ]
         if self.is_conllu:
             datum.insert(2, normalize('NFC', textdata.doc_id))
         data.append(datum)
         category_data.append(category_var.to_val(textdata.category))
     if len(text_categories) > 1:
         category_data = np.array(category_data)
     else:
         category_var = []
         category_data = np.empty((len(data), 0))
     domain = Domain([], category_var,
                     [StringVariable.make(name) for name in names])
     domain["name"].attributes["title"] = True
     data = np.array(data, dtype=object)
     if len(data):
         corpus = Corpus.from_numpy(domain,
                                    X=np.empty((len(category_data), 0)),
                                    Y=category_data,
                                    metas=data,
                                    text_features=[domain.metas[-1]])
     return corpus
Example #22
 def test_val_from_str(self):
     var = DiscreteVariable.make("a", values=["F", "M"])
     self.assertTrue(math.isnan(var.to_val(None)))
     self.assertEqual(var.to_val(1), 1)
Example #23
 def test_make(self):
     var = DiscreteVariable.make("a", values=["F", "M"])
     self.assertIsInstance(var, DiscreteVariable)
     self.assertEqual(var.name, "a")
     self.assertEqual(var.values, ["F", "M"])
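What make() buys over the plain constructor is reuse of a compatible existing variable (see the _find_compatible tests above). A minimal sketch, assuming Orange is installed; depending on the Orange version the second call returns the identical cached object or merely an equal new one.

from Orange.data import DiscreteVariable

a = DiscreteVariable.make("gender", values=("F", "M"))
b = DiscreteVariable.make("gender", values=("F", "M"))
print(a == b)      # True
print(a.values)    # ("F", "M") -- a tuple on recent Orange, a list on older versions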
Example #24
    def call_segmetation(self):
        """
        Segment all recordings and create new Orange table

        :return: Void
        """

        if self.data is None:
            print(self.window_size)
            return

        error = None

        try:
            segmentation = Segmentation()
            if self.window_size > max(self.data.metas[:, -2]):
                self.info.setStyleSheet(error_red)
                self.info.setText(
                    "Window size must be lower than largest sound clip!")
                return
            elif self.overlap > max(self.data.metas[:, -2]):
                self.info.setStyleSheet(error_red)
                self.info.setText(
                    "Overlap must be lower than largest sound clip!")
                return
            data = segmentation.segment_all(self.data, self.window_size,
                                            self.overlap, self.tmp_dir_id)

        except Exception as ex:
            error = ex

        if not error:

            self.info.setStyleSheet(success_green)
            self.info.setText("Segmentation successful")

            if data[0] != []:
                Y = DiscreteVariable.make(
                    "Target class",
                    values=self.data.domain.class_var.values,
                    ordered=True)
            else:
                Y = None

            segment_var = Orange.data.StringVariable("segment name")
            sound_var = Orange.data.StringVariable("sound")
            sound_var.attributes["type"] = "sound"
            size_var = Orange.data.ContinuousVariable("segment size",
                                                      number_of_decimals=0)
            length_var = Orange.data.ContinuousVariable("segment length",
                                                        number_of_decimals=2)
            framerate_var = Orange.data.ContinuousVariable(
                "segment framerate", number_of_decimals=0)

            domain = Orange.data.Domain(
                [], [Y] if Y is not None else [],
                [segment_var, sound_var, size_var, length_var, framerate_var])

            if len(data[0]):
                table = Orange.data.Table.from_numpy(
                    domain, numpy.empty((len(data[0]), 0), dtype=float),
                    data[0], data[1])
            else:
                table = None

            self.send("Segmentation", table)

        if error:
            self.info.setStyleSheet(error_red)
            self.info.setText("An error occurred:\n{}".format(error))
            return
Example #27
import pickle  # needed for pickle.load below

import Orange
from Orange.data import Domain, DiscreteVariable, ContinuousVariable
import numpy as np
import os

act_filename_pattern = r"a:\IsKnown_Results\activations_prelast_clsf_from_isVisible_20210415_gpu1_{}.h5"
results_folder = r"a:\IsKnown_Results"

#set_name = "Test"
set_name = "Train"

# Load the activation in pickle format
act_filename = act_filename_pattern.format(set_name)
(act_prelast, lbls) = pickle.load(open(act_filename, 'rb'))

# convert to Orange table and save
#   Later used for:
#       a) Here - later in code - SOM clustering
domain = Domain([
    ContinuousVariable.make("Feat_" + str(i))
    for i in np.arange(act_prelast.shape[1])
], DiscreteVariable.make(name="lbls", values=np.unique(lbls.astype(str))))
#train_class_indices = np.asarray ( [subcategories.index(train_class) for train_class in train_classes] )
orange_tab = Orange.data.Table.from_numpy(domain=domain,
                                          X=act_prelast,
                                          Y=lbls.astype(str))

# Save Orange table files
activations_orangeTable_filename = os.path.join(
    results_folder, "{}_activations_preLast_Orange.tab".format(set_name))
orange_tab.save(activations_orangeTable_filename)
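As a quick sanity check, the saved .tab file can be read back with Orange; a minimal sketch reusing the names defined above (assuming the save succeeded):

reloaded = Orange.data.Table(activations_orangeTable_filename)
print(len(reloaded), reloaded.domain.class_var.values)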
Example #28
def association_without_TLS(myList=[],
                            min_supp=0.1,
                            confidence=0.1,
                            min_lift=1,
                            name=""):
    df = myList
    print("Total rows before : ", int(df.shape[0]))

    allowed_columns = [
        'Sites using trackers', 'Sites setting third party cookies',
        'Sites using Google Analytics',
        'Google Analytics privacy extension enabled',
        'Web & mail servers in same country',
        'Content Security Policy header set', 'X-Frame-Options header set',
        'Secure XSS Protection header set',
        'Secure X-Content-Type-Options header set',
        'Referrer Policy header set', 'Server offers HTTPS',
        'Mail server supports encryption', 'Unintentional information leaks'
    ]

    for column in df.columns:
        if column not in allowed_columns:
            df.drop([column], axis=1, inplace=True)
    print("Sum of missing values:")
    print(df.isnull().sum())

    input_assoc_rules = df
    print(df.columns)

    print("Total rows = ", int(df.shape[0]))
    print("Total columns = ", int(df.shape[1]))
    total_rows = int(df.shape[0])

    domain_checks = Domain([
        DiscreteVariable.make(name=check, values=['0', '1', '1000'])
        for check in input_assoc_rules.columns
    ])
    data_gro_1 = Orange.data.Table.from_numpy(domain=domain_checks,
                                              X=input_assoc_rules.as_matrix(),
                                              Y=None)
    data_gro_1_en, mapping = OneHot.encode(data_gro_1, include_class=False)
    min_support = float(min_supp)
    print("num of required transactions = ",
          int(input_assoc_rules.shape[0] * min_support))
    num_trans = input_assoc_rules.shape[0] * min_support
    itemsets = dict(frequent_itemsets(data_gro_1_en, min_support=min_support))
    print(len(itemsets))

    confidence = float(confidence)

    rules_df = pd.DataFrame()
    rules = [(P, Q, supp, conf)
             for P, Q, supp, conf in association_rules(itemsets, confidence)
             if len(Q) == 1 and Q]

    restricted_ante = [
        'Mail server supports encryption=1', 'Server offers HTTPS=1'
    ]

    print("Step 1: Rules generated")

    names = {
        item: '{}={}'.format(var.name, val)
        for item, var, val in OneHot.decode(mapping, data_gro_1, mapping)
    }

    print("Step 2: Decoded")

    eligible_ante = [v for k, v in names.items()]  #allowed both 0 and 1
    N = input_assoc_rules.shape[0]

    rule_stats = list(rules_stats(rules, itemsets, N))

    print("Step 3: Stats for rules generated")
    rule_list_df = []

    rule_list_df = []
    for ex_rule_frm_rule_stat in rule_stats:
        ante = ex_rule_frm_rule_stat[0]
        cons = ex_rule_frm_rule_stat[1]
        named_cons = names[next(iter(cons))]

        #named_cons = [names[i] for i in cons if names[i] in eligible_ante]
        #named_cons = ', '.join(named_cons)
        #if named_cons in eligible_ante:
        rule_lhs = [names[i] for i in ante if names[i] in eligible_ante]
        ante_rule = ', '.join(rule_lhs)
        if ante_rule and len(rule_lhs) > 1 and len(rule_lhs) < 5:
            rule_dict = {
                'support': ex_rule_frm_rule_stat[2],
                'confidence': ex_rule_frm_rule_stat[3],
                'lift': ex_rule_frm_rule_stat[6],
                'antecedent': ante_rule,
                'consequent': named_cons
            }
            rule_list_df.append(rule_dict)
    rules_df = pd.DataFrame(rule_list_df)

    print("{} association rules generated".format(rules_df.shape[0]))

    if not rules_df.empty:
        rules_df['support'] = rules_df['support'].apply(
            lambda x: x / total_rows)
        rules_df = rules_df[rules_df['lift'] >= float(min_lift)]
        rules_df = rules_df[[
            'antecedent', 'consequent', 'support', 'confidence', 'lift'
        ]].sort_values(['lift'], ascending=False)
        rules_df.to_csv(os.path.join('/home/sysop/',
                                     "ohne_tls_" + name + ".csv"),
                        sep='\t',
                        index=False)


#        print(rules_df.to_csv(sep=' ', index=False, header=False))
    else:
        print("Unable to generate any rule")
Example #29
plt.title('Item sales distribution')
"""# Analyze items contributing to top sales"""

item_summary_df['item_perc'] = item_summary_df['item_id'] / total_item_id
item_summary_df['total_perc'] = item_summary_df.item_perc.cumsum()
item_summary_df.head(10)
"""# Analyze items contributing to top 50% of sales"""

item_summary_df[item_summary_df.total_perc <= 0.5].shape

item_summary_df[item_summary_df.total_perc <= 0.5]
"""# Construct Orange Table"""

input_assoc_rules = product_df
domain_product = Domain([
    DiscreteVariable.make(name=item, values=['0', '1'])
    for item in input_assoc_rules.columns
])
data_gro_1 = Orange.data.Table.from_numpy(domain=domain_product,
                                          X=input_assoc_rules.as_matrix(),
                                          Y=None)
"""# Prune Dataset for frequently purchased items"""


def prune_dataset(input_df,
                  length_trans=2,
                  total_sales_perc=0.5,
                  start_item=None,
                  end_item=None):
    if 'total_items' in input_df.columns:
        del (input_df['total_items'])
Example #30
    def call_feature_extraction(self, id):
        """
        :param id: feature id
        :return: None; loads the data with the extracted features into an Orange table
        """
        if self.data is None:
            return

        error = None
        feature_extracted_data = []
        try:
            feature_extraction = FeatureExtraction()

            for i in range(len(self.data.metas)):
                if os.path.isfile(self.data.metas[i][1]):
                    filename = self.data.metas[i][1]
                else:
                    filename = self.data.metas[i][1].split(".wav")[0]
                if id == 0:
                    array = feature_extraction.extract_emobase_features(
                        filename)
                elif id == 1 or id == 2:
                    array = feature_extraction.extract_mfcc_plp_features(
                        filename, id)
                elif id == 3:
                    array = feature_extraction.extract_chroma_features(
                        filename)
                else:
                    framerate, data = read(filename)
                    array = feature_extraction.extract_all_mean_features(
                        data, framerate)

                feature_extracted_data.append(array)

            self.X = numpy.array(feature_extracted_data)

        except Exception as ex:
            error = ex

        if not error:
            self.info.setStyleSheet(success_green)
            self.info.setText("Features successful extracted!")

            dimensions = range(self.X.shape[1])
            attributes = [
                ContinuousVariable.make('Feature {:d}'.format(d))
                for d in dimensions
            ]

            if self.data.Y.size:
                Y = DiscreteVariable.make(
                    "Target class",
                    values=self.data.domain.class_var.values,
                    ordered=True)
            else:
                Y = None

            self.domain = Domain(attributes=attributes,
                                 class_vars=Y,
                                 metas=self.data.domain.metas)
            orange_table = Table(self.domain, self.X, self.data.Y,
                                 self.data.metas)

            self.send("Extracted features", orange_table)

        if error:
            self.info.setStyleSheet(error_red)
            self.info.setText("An error occurred:\n{}".format(error))
            return
Example #31
def association(myList=[], min_supp=0.1, confidence=0.1, min_lift=1, name=""):
    df = myList
    mydict = []
    for group in DEFAULT_GROUP_ORDER:
        for check, data in CHECKS[group].items():
            mydict.append(data.get('short_title'))

    melted_data = pd.melt(df,
                          id_vars=['url'],
                          value_vars=mydict,
                          var_name='check',
                          value_name='value')

    melted_data = melted_data.replace(to_replace='None', value=np.nan).dropna()

    #melted_data.groupby(by=['check', 'value']).size().unstack().plot(kind='bar', stacked=True, figsize=(8,5))
    #plt.show()

    url_all = []
    sites = Site.objects.all()
    print(sites.count())
    for site in sites:
        url_parsed = urlparse(site.url)
        url_all.append(url_parsed.netloc)
    print(len(url_all))
    print(len(list(set(url_all))))

    print("Total rows before : ", int(df.shape[0]))

    df['missing_val'] = df.isnull().sum(axis=1)

    print("Average missing values in each transaction = ",
          float(np.ceil(df['missing_val'].mean())))
    df = df.drop('missing_val', axis=1)

    #df =df[(df['Server offers HTTPS'] == '1') & (df['Mail server supports encryption'] == '1')]

    restricted_columns = [
        'Sites setting first party cookies', 'Sites using third party embeds',
        'Google Analytics privacy extension enabled',
        'HTTP URL also reachable via HTTPS', 'HSTS header duration sufficient',
        'Server ready for HSTS preloading',
        'Web server Protected against Secure Renegotiation',
        'Included in Chrome HSTS preload list', 'Web server supports SSL 2.0',
        'Server offers HTTPS', 'Mail server supports encryption',
        'Mail server supports SSL 2.0',
        'Mail server Protected against Secure Renegotiation',
        'Mail server Protected against Heartbleed',
        'Web server Protected against Heartbleed',
        'Web server Protected against BEAST',
        'Mail server Protected against BEAST',
        'Mail server Protected against LOGJAM',
        'Web server Protected against LOGJAM',
        'Web server Protected against LUCKY13',
        'Mail server Protected against LUCKY13',
        'Mail server Protected against CRIME',
        'Web server Protected against CRIME',
        'Mail server Protected against CCS attack',
        'Web server Protected against CCS attack',
        'Mail server Protected against DROWN',
        'Web server Protected against DROWN',
        'Mail server Protected against FREAK',
        'Web server Protected against FREAK',
        'Mail server Protected against BREACH', 'Domain has Mail server',
        'Mail server Protected against Ticketbleed',
        'Web server Protected against Ticketbleed'
    ]

    for column in restricted_columns:
        if column in df.columns:
            df.drop([column], axis=1, inplace=True)
    print(df.isnull().sum())

    input_assoc_rules = df
    print(df.columns)

    print("Total rows = ", int(df.shape[0]))
    print("Total columns = ", int(df.shape[1]))
    total_rows = int(df.shape[0])

    domain_checks = Domain([
        DiscreteVariable.make(name=check, values=['0', '1', '1000'])
        for check in input_assoc_rules.columns
    ])
    data_gro_1 = Orange.data.Table.from_numpy(domain=domain_checks,
                                              X=input_assoc_rules.as_matrix(),
                                              Y=None)
    data_gro_1_en, mapping = OneHot.encode(data_gro_1, include_class=False)
    min_support = float(min_supp)
    print("num of required transactions = ",
          int(input_assoc_rules.shape[0] * min_support))
    num_trans = input_assoc_rules.shape[0] * min_support
    itemsets = dict(frequent_itemsets(data_gro_1_en, min_support=min_support))
    print(len(itemsets))

    confidence = float(confidence)

    rules_df = pd.DataFrame()
    rules = [(P, Q, supp, conf)
             for P, Q, supp, conf in association_rules(itemsets, confidence)
             if len(Q) == 1 and Q]

    restricted_ante = [
        'Mail server supports SSL 3.0=0', 'Web server supports SSL 3.0=0',
        'Web server supports TLS 1.1=1', 'Web server supports TLS 1.2=1',
        'Mail server supports TLS 1.1=1', 'Mail server supports TLS 1.2=1',
        'Mail server Protected against BREACH=1',
        'Mail server supports Legacy TLS 1.0=1',
        'Web server supports Legacy TLS 1.0'
    ]

    print("Step 1: Rules generated")

    names = {
        item: '{}={}'.format(var.name, val)
        for item, var, val in OneHot.decode(mapping, data_gro_1, mapping)
    }

    print("Step 2: Decoded")

    eligible_ante = [v for k, v in names.items()]  #allowed both 0 and 1
    N = input_assoc_rules.shape[0]
    print(N)
    rule_stats = list(rules_stats(rules, itemsets, N))

    print("Step 3: Stats for rules generated")
    rule_list_df = []

    rule_list_df = []
    for ex_rule_frm_rule_stat in rule_stats:
        ante = ex_rule_frm_rule_stat[0]
        cons = ex_rule_frm_rule_stat[1]
        named_cons = names[next(iter(cons))]

        #named_cons = [names[i] for i in cons if names[i] in eligible_ante]
        #named_cons = ', '.join(named_cons)
        #if named_cons in eligible_ante:
        rule_lhs = [names[i] for i in ante if names[i] in eligible_ante]
        ante_rule = ', '.join(rule_lhs)
        if ante_rule and len(rule_lhs) > 1 and len(rule_lhs) < 5:
            rule_dict = {
                'support': ex_rule_frm_rule_stat[2],
                'confidence': ex_rule_frm_rule_stat[3],
                'lift': ex_rule_frm_rule_stat[6],
                'antecedent': ante_rule,
                'consequent': named_cons
            }
            rule_list_df.append(rule_dict)
    rules_df = pd.DataFrame(rule_list_df)
    print("{} association rules generated".format(rules_df.shape[0]))

    if not rules_df.empty:
        rules_df['support'] = rules_df['support'].apply(
            lambda x: x / total_rows)
        rules_df = rules_df[rules_df['lift'] >= float(min_lift)]
        rules_df = rules_df[[
            'antecedent', 'consequent', 'support', 'confidence', 'lift'
        ]].sort_values(['lift'], ascending=False)
        rules_df.to_csv(os.path.join('/home/sysop/', "tls_" + name + ".csv"),
                        sep='\t',
                        index=False)


#        print(rules_df.to_csv(sep=' ', index=False, header=False))
    else:
        print("Unable to generate any rule")
Example #32
    def __get_pivot_tab_domain(self, val_var, X, X_h, X_v, X_t, agg_funs):
        def map_values(index, _X):
            values = np.unique(_X[:, index])
            values = np.delete(values, np.where(values == "nan")[0])
            for j, value in enumerate(values):
                _X[:, index][_X[:, index] == value] = j
            return values

        create_time_var = \
            isinstance(val_var, TimeVariable) and \
            all(fun in self.TimeVarFunctions for fun in agg_funs)
        create_cont_var = \
            not val_var or val_var.is_continuous and \
            (not isinstance(val_var, TimeVariable) or
             all(fun in self.FloatFunctions for fun in agg_funs))

        vals = np.array(self._col_var.values)[self._col_var_groups.astype(int)]
        if create_time_var:
            kwargs = {
                "have_date": val_var.have_date,
                "have_time": val_var.have_time
            }
            attrs = [[TimeVariable(f"{v}", **kwargs) for v in vals]] * 2
            attrs.extend([[TimeVariable("Total", **kwargs)]] * 2)
        elif create_cont_var:
            attrs = [[ContinuousVariable(f"{v}", 1) for v in vals]] * 2
            attrs.extend([[ContinuousVariable("Total", 1)]] * 2)
        else:
            attrs = []
            for x in (X, X_h):
                attrs.append([
                    DiscreteVariable(f"{v}", map_values(i, x))
                    for i, v in enumerate(vals, 2)
                ])
            for x in (X_v, X_t):
                attrs.append([DiscreteVariable("Total", map_values(0, x))])
        row_var_h = DiscreteVariable(self._row_var.name, values=["Total"])
        aggr_attr = DiscreteVariable('Aggregate', [str(f) for f in agg_funs])

        same_row_col = self._col_var is self._row_var

        extra_vars = [self._row_var, aggr_attr]
        uniq_a = get_unique_names_duplicates([v.name for v in extra_vars] +
                                             [atr.name for atr in attrs[0]])
        for (idx, var), u in zip(enumerate(chain(extra_vars, attrs[0])),
                                 uniq_a):
            if var.name == u:
                continue
            if idx == 0:
                self.renamed.append(self._row_var.name)
                self._row_var = self._row_var.copy(name=u)
                if same_row_col:
                    self._col_var = self._row_var
                row_var_h = row_var_h.copy(name=u)
            elif idx == 1:
                self.renamed.append(aggr_attr.name)
                aggr_attr = aggr_attr.copy(name=u)
            else:
                self.renamed.append(var.name)
                attrs[0][idx - 2] = var.copy(name=u)
                attrs[1][idx - 2] = var.copy(name=u)

        if same_row_col:
            vals = tuple(v.name for v in attrs[0])
            self._row_var.make(self._row_var.name, values=vals)
            vals = tuple(v.name for v in attrs[2])
            row_var_h.make(row_var_h.name, vals)

        return (Domain([self._row_var, aggr_attr] + attrs[0]),
                Domain([row_var_h, aggr_attr] + attrs[1]), Domain(attrs[2]),
                Domain(attrs[3]))
Example #33
item_summary_df[item_summary_df.total_perc <= 0.5].shape


# In[13]:


item_summary_df[item_summary_df.total_perc <= 0.5]


# # Construct Orange Table 

# In[16]:


input_assoc_rules = grocery_df
domain_grocery = Domain([
    DiscreteVariable.make(name=item, values=['0', '1'])
    for item in input_assoc_rules.columns
])
data_gro_1 = Orange.data.Table.from_numpy(domain=domain_grocery,
                                          X=input_assoc_rules.as_matrix(),
                                          Y=None)


# # Prune Dataset for frequently purchased items

# In[2]:


def prune_dataset(input_df,
                  length_trans=2,
                  total_sales_perc=0.5,
                  start_item=None,
                  end_item=None):
    if 'total_items' in input_df.columns:
        del input_df['total_items']
    item_count = input_df.sum().sort_values(ascending=False).reset_index()
    total_items = sum(input_df.sum().sort_values(ascending=False))
    item_count.rename(columns={item_count.columns[0]: 'item_name',
                               item_count.columns[1]: 'item_count'},
                      inplace=True)
    if not start_item and not end_item: