예제 #1
0
    def test_read_and_save_attributes(self):
        """Check variable attributes after reading and after a write/re-read.

        The same expectations are applied twice: to the table parsed from the
        sample text, and to the table parsed from the text that
        ``TabReader.write_file`` produced for it.
        """

        def check_attributes(parsed):
            # The domain must hold exactly four variables; only the second
            # feature and the first class variable are inspected.
            _, feature, target, _ = parsed.domain.variables
            self.assertIsInstance(feature, DiscreteVariable)
            self.assertEqual(feature.name, "Feature 2")
            self.assertEqual(feature.attributes, {'a': 1, 'b': 2})
            self.assertIn(target, parsed.domain.class_vars)
            self.assertIsInstance(target, DiscreteVariable)
            self.assertEqual(target.name, "Class 1")
            self.assertEqual(target.attributes, {'x': 'a longer string'})

        sample = """\
        Feature 1\tFeature 2\tClass 1\tClass 42
        d        \tM F      \td      \td
                 \ta=1 b=2 \tclass x=a\\ longer\\ string \tclass
        1.0      \tM        \t5      \trich
        """
        first_pass = read_tab_file(io.StringIO(sample))
        check_attributes(first_pass)

        # close() is stubbed out, presumably so the buffer stays readable
        # after write_file is done with it -- confirm against TabReader.
        sink = io.StringIO()
        sink.close = lambda: None
        TabReader.write_file(sink, first_pass)

        second_pass = read_tab_file(io.StringIO(sink.getvalue()))
        check_attributes(second_pass)
예제 #2
0
    def test_read_and_save_attributes(self):
        """Check variable attributes after reading and after a write/re-read.

        The same expectations are applied twice: to the table parsed from the
        sample text, and to the table parsed from the text that
        ``TabReader.write_file`` produced for it.
        """

        def check_attributes(parsed):
            # The domain must hold exactly four variables; only the second
            # feature and the first class variable are inspected.
            _, feature, target, _ = parsed.domain.variables
            self.assertIsInstance(feature, DiscreteVariable)
            self.assertEqual(feature.name, "Feature 2")
            self.assertEqual(feature.attributes, {'a': 1, 'b': 2})
            self.assertIn(target, parsed.domain.class_vars)
            self.assertIsInstance(target, DiscreteVariable)
            self.assertEqual(target.name, "Class 1")
            self.assertEqual(target.attributes, {'x': 'a longer string'})

        sample = """\
        Feature 1\tFeature 2\tClass 1\tClass 42
        d        \tM F      \td      \td
                 \ta=1 b=2 \tclass x=a\\ longer\\ string \tclass
        1.0      \tM        \t5      \trich
        """
        first_pass = read_tab_file(io.StringIO(sample))
        check_attributes(first_pass)

        # close() is stubbed out, presumably so the buffer stays readable
        # after write_file is done with it -- confirm against TabReader.
        sink = io.StringIO()
        sink.close = lambda: None
        TabReader.write_file(sink, first_pass)

        second_pass = read_tab_file(io.StringIO(sink.getvalue()))
        check_attributes(second_pass)
예제 #3
0
 def test_no_metadata(self):
     """Assert that no ``.metadata`` file is written for empty attributes.

     Loads the "titanic" table, replaces its attributes with an empty
     ``OrderedDict`` and checks that ``TabReader.write_table_metadata``
     leaves no ``out.tab.metadata`` file in the scratch directory.
     """
     tempdir = tempfile.mkdtemp()
     try:
         table = Table("titanic")
         table.attributes = OrderedDict()
         fname = path.join(tempdir, "out.tab")
         TabReader.write_table_metadata(fname, table)
         self.assertFalse(path.isfile(fname + ".metadata"))
     finally:
         # Remove the scratch directory even when loading, writing or the
         # assertion fails, so failed runs do not leave directories behind.
         shutil.rmtree(tempdir)
예제 #4
0
 def test_no_metadata(self):
     """Assert that no ``.metadata`` file is written for empty attributes.

     ``self.data`` gets an empty ``OrderedDict`` as its attributes before
     ``TabReader.write_table_metadata`` is called on a scratch path.
     """
     scratch = tempfile.mkdtemp()
     try:
         target = path.join(scratch, "out.tab")
         sidecar = target + ".metadata"
         self.data.attributes = OrderedDict()
         TabReader.write_table_metadata(target, self.data)
         self.assertFalse(path.isfile(sidecar))
     finally:
         # Always drop the scratch directory, pass or fail.
         shutil.rmtree(scratch)
예제 #5
0
 def test_no_metadata(self):
     """Assert that no ``.metadata`` file is written for empty attributes.

     ``self.data`` gets an empty ``OrderedDict`` as its attributes before
     ``TabReader.write_table_metadata`` is called on a scratch path.
     """
     scratch = tempfile.mkdtemp()
     try:
         target = path.join(scratch, "out.tab")
         sidecar = target + ".metadata"
         self.data.attributes = OrderedDict()
         TabReader.write_table_metadata(target, self.data)
         self.assertFalse(path.isfile(sidecar))
     finally:
         # Always drop the scratch directory, pass or fail.
         shutil.rmtree(scratch)
예제 #6
0
 def test_had_metadata_now_there_is_none(self):
     """Assert the ``.metadata`` file disappears once attributes are gone.

     Metadata is written twice for the same path: once with attribute
     ``"a"`` set on ``self.data`` (file expected), and once after that
     attribute was deleted (file expected to be absent).
     """
     scratch = tempfile.mkdtemp()
     try:
         target = path.join(scratch, "out.tab")
         sidecar = target + ".metadata"

         # First write: one attribute present.
         self.data.attributes["a"] = "aa"
         TabReader.write_table_metadata(target, self.data)
         self.assertTrue(path.isfile(sidecar))

         # Second write: the attribute has been removed again.
         del self.data.attributes["a"]
         TabReader.write_table_metadata(target, self.data)
         self.assertFalse(path.isfile(sidecar))
     finally:
         shutil.rmtree(scratch)
예제 #7
0
 def test_metadata(self):
     """Assert a ``.metadata`` file is written when attributes are set.

     The "titanic" table is given two attributes (``a`` and ``b``) before
     ``TabReader.write_table_metadata`` is called on a scratch path.
     """
     scratch = tempfile.mkdtemp()
     try:
         titanic = Table("titanic")
         titanic.attributes = OrderedDict()
         # Pairs are inserted in the listed order: "a" first, then "b".
         titanic.attributes.update([("a", "aa"), ("b", "bb")])
         target = path.join(scratch, "out.tab")
         TabReader.write_table_metadata(target, titanic)
         sidecar = target + ".metadata"
         self.assertTrue(path.isfile(sidecar))
     finally:
         shutil.rmtree(scratch)
 def test_metadata(self):
     """Assert a ``.metadata`` file is written when attributes are set.

     The "titanic" table is given two attributes (``a`` and ``b``) before
     ``TabReader.write_table_metadata`` is called on a scratch path.
     """
     scratch = tempfile.mkdtemp()
     try:
         titanic = Table("titanic")
         titanic.attributes = OrderedDict()
         # Pairs are inserted in the listed order: "a" first, then "b".
         titanic.attributes.update([("a", "aa"), ("b", "bb")])
         target = path.join(scratch, "out.tab")
         TabReader.write_table_metadata(target, titanic)
         sidecar = target + ".metadata"
         self.assertTrue(path.isfile(sidecar))
     finally:
         shutil.rmtree(scratch)
예제 #9
0
 def test_had_metadata_now_there_is_none(self):
     """Assert the ``.metadata`` file disappears once attributes are gone.

     Metadata is written twice for the same path: once with attribute
     ``"a"`` set on ``self.data`` (file expected), and once after that
     attribute was deleted (file expected to be absent).
     """
     scratch = tempfile.mkdtemp()
     try:
         target = path.join(scratch, "out.tab")
         sidecar = target + ".metadata"

         # First write: one attribute present.
         self.data.attributes["a"] = "aa"
         TabReader.write_table_metadata(target, self.data)
         self.assertTrue(path.isfile(sidecar))

         # Second write: the attribute has been removed again.
         del self.data.attributes["a"]
         TabReader.write_table_metadata(target, self.data)
         self.assertFalse(path.isfile(sidecar))
     finally:
         shutil.rmtree(scratch)
예제 #10
0
    def table_from_html(self, html):
        """Parse the last ``<table>`` in *html* and load it via ``TabReader``.

        Column names are built from rows 1 and 3 of the table head and cell
        values from the table body; both are written as tab-separated text
        into an in-memory buffer that ``TabReader`` then reads.  The result
        is named after the text of the first ``<h2>`` in the body, keeping
        only what follows the first ``': '`` when that separator occurs.

        Raises ``DataEmptyError`` when *html* contains no table, and
        ``DataIsAnalError`` when *html* contains ``'<h2>Anal'`` or
        ``'div_analiza_'``.
        """
        soup = BeautifulSoup(html, 'html.parser')
        try:
            html_table = soup.find_all('table')[-1]
        except IndexError:
            raise DataEmptyError

        if any(marker in html for marker in ('<h2>Anal', 'div_analiza_')):
            raise DataIsAnalError

        def _header_row_strings(row):
            # Each titled header cell is repeated ``colspan`` times (once when
            # the attribute is missing or empty).  The cells are selected
            # right away; the repetition itself stays lazy.
            selector = 'thead tr:nth-of-type(%d) th[title]' % row
            cells = html_table.select(selector)
            return chain.from_iterable(
                repeat(th.get_text(), int(th.get('colspan') or 1))
                for th in cells)

        def _cell_text(td):
            # If no span, feature is a number or a text field
            if td.span is None:
                return td.get_text()
            # If have span, it's a number, but if negative, replace with NaN
            if td.contents[0].strip().startswith('-'):
                return ''
            # Else if span, the number is its code, but we want its value
            return td.span.get_text()[1:-1]

        # self.DATETIME_VAR (available when Paradata is enabled in 1ka UI)
        # should match this variable name format
        header = []
        for th1, th3 in zip(_header_row_strings(1), _header_row_strings(3)):
            name = th1.rstrip(':')
            if th3 != th1:
                name += ' ({})'.format(th3.rstrip(':'))
            header.append(name)

        values = []
        for tr in html_table.select('tbody tr'):
            values.append([
                _cell_text(td) for td in tr.select('td')
                if 'data_uid' not in td.get('class', ())
            ])

        # Save parsed values into in-mem file for default values processing
        tsv = StringIO()
        tsv_writer = csv.writer(tsv, delimiter='\t')
        tsv_writer.writerow(header)
        tsv_writer.writerows(values)
        tsv.flush()
        tsv.seek(0)

        data = TabReader(tsv).read()

        heading = soup.select('body h2:nth-of-type(1)')[0].get_text()
        data.name = heading.split(': ', maxsplit=1)[-1]

        return data
예제 #11
0
    def test_read_and_save_attributes(self):
        """Check variable attributes across repeated write/re-read cycles.

        The parsed sample and its first re-read copy must show the same
        attributes on "Feature 2" and "Class 1".  A ``path`` attribute is
        then added to the class variable and must still be present after a
        further write and re-read.
        """

        def check_attributes(parsed):
            # The domain must hold exactly four variables; only the second
            # feature and the first class variable are inspected.
            _, feature, target, _ = parsed.domain.variables
            self.assertIsInstance(feature, DiscreteVariable)
            self.assertEqual(feature.name, "Feature 2")
            self.assertEqual(feature.attributes, {"a": 1, "b": 2})
            self.assertIn(target, parsed.domain.class_vars)
            self.assertIsInstance(target, DiscreteVariable)
            self.assertEqual(target.name, "Class 1")
            self.assertEqual(target.attributes, {"x": "a longer string"})
            return target

        sample = """\
        Feature 1\tFeature 2\tClass 1\tClass 42
        d        \tM F      \td      \td
                 \ta=1 b=2 \tclass x=a\\ longer\\ string \tclass
        1.0      \tM        \t5      \trich
        """
        first_pass = read_tab_file(io.StringIO(sample))
        check_attributes(first_pass)

        # close() is stubbed out, presumably so the buffer stays readable
        # after write_file is done with it -- confirm against TabReader.
        sink = io.StringIO()
        sink.close = lambda: None
        TabReader.write_file(sink, first_pass)

        second_pass = read_tab_file(io.StringIO(sink.getvalue()))
        class_var = check_attributes(second_pass)

        # Attach a path-like attribute value and push it through another cycle.
        location = "/path/to/somewhere"
        class_var.attributes["path"] = location
        sink = io.StringIO()
        sink.close = lambda: None
        TabReader.write_file(sink, second_pass)
        sink.seek(0)

        third_pass = read_tab_file(sink)
        _, _, class_var, _ = third_pass.domain.variables
        self.assertEqual(class_var.attributes["path"], location)
예제 #12
0
 def test_many_discrete(self):
     """Fail when reading a column of many distinct values is too slow.

     Builds an in-memory file whose first lines are "Poser", "d" and an
     empty line, followed by 30000 distinct values ``K0`` .. ``K29999``,
     and raises ``AssertionError`` if ``TabReader(...).read()`` takes
     longer than 2 seconds.
     """
     b = io.StringIO()
     b.write("Poser\nd\n\n")
     b.writelines("K" + str(i) + "\n" for i in range(30000))
     # NOTE(review): the buffer is handed over positioned at its end;
     # presumably the reader rewinds it itself -- confirm.
     # perf_counter() is monotonic, so system clock adjustments during the
     # run cannot distort the measured duration the way time.time() can.
     start = time.perf_counter()
     TabReader(b).read()
     elapsed = time.perf_counter() - start
     if elapsed > 2:
         raise AssertionError(
             "reading 30000 discrete values took %.2f s (limit: 2 s)"
             % elapsed)
예제 #13
0
    def table_from_html(self, html):
        """Parse the last ``<table>`` in *html* and load it via ``TabReader``.

        Column names are built from rows 1 and 3 of the table head and cell
        values from the table body; both are written as tab-separated text
        into an in-memory buffer that ``TabReader`` then reads.  The result
        is named after the text of the first ``<h2>`` in the body, keeping
        only what follows the first ``': '`` when that separator occurs.

        Raises ``DataEmptyError`` when *html* contains no table, and
        ``DataIsAnalError`` when *html* contains ``'<h2>Anal'`` or
        ``'div_analiza_'``.
        """
        soup = BeautifulSoup(html, 'html.parser')
        try:
            html_table = soup.find_all('table')[-1]
        except IndexError:
            raise DataEmptyError

        if any(marker in html for marker in ('<h2>Anal', 'div_analiza_')):
            raise DataIsAnalError

        def _header_row_strings(row):
            # Each titled header cell is repeated ``colspan`` times (once when
            # the attribute is missing or empty).  The cells are selected
            # right away; the repetition itself stays lazy.
            selector = 'thead tr:nth-of-type(%d) th[title]' % row
            cells = html_table.select(selector)
            return chain.from_iterable(
                repeat(th.get_text(), int(th.get('colspan') or 1))
                for th in cells)

        def _cell_text(td):
            # If no span, feature is a number or a text field
            if td.span is None:
                return td.get_text()
            # If have span, it's a number, but if negative, replace with NaN
            if td.contents[0].strip().startswith('-'):
                return ''
            # Else if span, the number is its code, but we want its value
            return td.span.get_text()[1:-1]

        # self.DATETIME_VAR (available when Paradata is enabled in 1ka UI)
        # should match this variable name format
        header = []
        for th1, th3 in zip(_header_row_strings(1), _header_row_strings(3)):
            name = th1.rstrip(':')
            if th3 != th1:
                name += ' ({})'.format(th3.rstrip(':'))
            header.append(name)

        values = []
        for tr in html_table.select('tbody tr'):
            values.append([
                _cell_text(td) for td in tr.select('td')
                if 'data_uid' not in td.get('class', ())
            ])

        # Save parsed values into in-mem file for default values processing
        tsv = StringIO()
        tsv_writer = csv.writer(tsv, delimiter='\t')
        tsv_writer.writerow(header)
        tsv_writer.writerows(values)
        tsv.flush()
        tsv.seek(0)

        data = TabReader(tsv).read()

        heading = soup.select('body h2:nth-of-type(1)')[0].get_text()
        data.name = heading.split(': ', maxsplit=1)[-1]

        return data
예제 #14
0
    def test_bad_data(self):
        """
        Firstly it creates predictions with TreeLearner. Then sends predictions and
        different data with different domain to Predictions widget. Those different
        data and domain are similar to original data and domain but they have three
        different target values instead of two.
        GH-2129
        """
        Variable._clear_all_caches()

        filestr1 = """\
        age\tsex\tsurvived
        d\td\td
        \t\tclass
        adult\tmale\tyes
        adult\tfemale\tno
        child\tmale\tyes
        child\tfemale\tyes
        """
        filestr2 = """\
        age\tsex\tsurvived
        d\td\td
        \t\tclass
        adult\tmale\tyes
        adult\tfemale\tno
        child\tmale\tyes
        child\tfemale\tunknown
        """

        try:
            # Train a tree on the table whose class column is only yes/no.
            file1 = io.StringIO(filestr1)
            table = TabReader(file1).read()
            learner = TreeLearner()
            tree = learner(table)

            # Same columns, but the class column also contains "unknown".
            file2 = io.StringIO(filestr2)
            bad_table = TabReader(file2).read()

            self.send_signal(self.widget.Inputs.predictors, tree, 1)

            with excepthook_catch():
                self.send_signal(self.widget.Inputs.data, bad_table)
        finally:
            # Clear the caches even when the test fails, so that tests
            # expecting standard titanic still work afterwards.
            Variable._clear_all_caches()
예제 #15
0
    def test_read_save_quoted(self):
        """Check that quoted meta values survive reading and a write/re-read.

        The first column of the sample holds values wrapped in doubled quote
        characters, one of them with an embedded tab; the first meta column
        must equal ``expected`` both after the initial read and after the
        table was written with ``TabReader.write_file`` and read back.
        """
        source = '''\
        S\tA
        s\td
        m\t
        """a"""\ti
        """b"""\tj
        """c\td"""\tk
        '''
        expected = ['"a"', '"b"', '"c\td"']

        def first_meta_column(tab):
            return tab.metas[:, 0].tolist()

        loaded = read_tab_file(io.StringIO(source))
        self.assertSequenceEqual(first_meta_column(loaded), expected)

        # close() is stubbed out, presumably so the buffer stays readable
        # after write_file is done with it -- confirm against TabReader.
        sink = io.StringIO()
        sink.close = lambda: None
        TabReader.write_file(sink, loaded)

        reloaded = read_tab_file(io.StringIO(sink.getvalue()))
        self.assertSequenceEqual(first_meta_column(reloaded), expected)
예제 #16
0
    def test_read_save_quoted(self):
        """Check that quoted meta values survive reading and a write/re-read.

        The first column of the sample holds values wrapped in doubled quote
        characters, one of them with an embedded tab; the first meta column
        must equal ``expected`` both after the initial read and after the
        table was written with ``TabReader.write_file`` and read back.
        """
        source = '''\
        S\tA
        s\td
        m\t
        """a"""\ti
        """b"""\tj
        """c\td"""\tk
        '''
        expected = ['"a"', '"b"', '"c\td"']

        def first_meta_column(tab):
            return tab.metas[:, 0].tolist()

        loaded = read_tab_file(io.StringIO(source))
        self.assertSequenceEqual(first_meta_column(loaded), expected)

        # close() is stubbed out, presumably so the buffer stays readable
        # after write_file is done with it -- confirm against TabReader.
        sink = io.StringIO()
        sink.close = lambda: None
        TabReader.write_file(sink, loaded)

        reloaded = read_tab_file(io.StringIO(sink.getvalue()))
        self.assertSequenceEqual(first_meta_column(reloaded), expected)
예제 #17
0
    def test_sheets(self):
        """Assert that a ``TabReader`` over a text stream has empty ``sheets``."""
        # Joining over a str puts every character of it on its own line.
        content = "\n".join("xd dbac")
        reader = TabReader(io.StringIO(content))

        self.assertEqual(reader.sheets, ())
예제 #18
0
def read_tab_file(filename):
    """Return whatever ``TabReader(filename).read()`` produces.

    *filename* is handed to ``TabReader`` unchanged.
    """
    reader = TabReader(filename)
    return reader.read()
예제 #19
0
 def test_data_name(self):
     """Assert that 'iris' and its re-read copy are both named 'iris'.

     The copy is read by ``TabReader`` from the first table's ``__file__``.
     """
     stock = Table('iris')
     reread = TabReader(stock.__file__).read()
     for tab in (stock, reread):
         self.assertEqual(tab.name, 'iris')