def test_read_and_save_attributes(self):
    """Variable attributes must survive a tab-file write/read round trip.

    The sample is parsed once, checked, written back with
    ``TabReader.write_file`` and parsed again; the same expectations
    must hold for both tables.
    """
    samplefile = """\
    Feature 1\tFeature 2\tClass 1\tClass 42
    d \tM F \td \td
    \ta=1 b=2 \tclass x=a\\ longer\\ string \tclass
    1.0 \tM \t5 \trich
    """

    def check(tab):
        # Unpacking into four names also pins the number of variables.
        _, second_feature, first_class, _ = tab.domain.variables
        self.assertIsInstance(second_feature, DiscreteVariable)
        self.assertEqual(second_feature.name, "Feature 2")
        self.assertEqual(second_feature.attributes, {'a': 1, 'b': 2})
        self.assertIn(first_class, tab.domain.class_vars)
        self.assertIsInstance(first_class, DiscreteVariable)
        self.assertEqual(first_class.name, "Class 1")
        self.assertEqual(first_class.attributes, {'x': 'a longer string'})

    source = io.StringIO(samplefile)
    loaded = read_tab_file(source)
    check(loaded)

    sink = io.StringIO()
    # Neutralise close() so getvalue() stays usable after writing
    # (presumably write_file closes its target -- confirm).
    sink.close = lambda: None
    TabReader.write_file(sink, loaded)
    written = sink.getvalue()

    reloaded = read_tab_file(io.StringIO(written))
    check(reloaded)
def test_no_metadata(self):
    """A table without attributes must not produce a ``.metadata`` file.

    Loads the "titanic" table, empties its ``attributes`` and asks
    ``TabReader.write_table_metadata`` to write metadata next to a path
    inside a scratch directory; no ``<path>.metadata`` file may exist
    afterwards.
    """
    tempdir = tempfile.mkdtemp()
    try:
        table = Table("titanic")
        table.attributes = OrderedDict()
        fname = path.join(tempdir, "out.tab")
        TabReader.write_table_metadata(fname, table)
        self.assertFalse(path.isfile(fname + ".metadata"))
    finally:
        # Clean up even when loading, writing or the assertion fails;
        # otherwise every failed run leaks a temporary directory.
        shutil.rmtree(tempdir)
def test_no_metadata(self):
    """With empty ``attributes`` no ``.metadata`` side file is written.

    Uses ``self.data`` (a fixture provided outside this method) and a
    scratch directory that is always removed afterwards.
    """
    scratch_dir = tempfile.mkdtemp()
    try:
        self.data.attributes = OrderedDict()
        target = path.join(scratch_dir, "out.tab")
        TabReader.write_table_metadata(target, self.data)
        sidecar = target + ".metadata"
        self.assertFalse(path.isfile(sidecar))
    finally:
        shutil.rmtree(scratch_dir)
def test_had_metadata_now_there_is_none(self):
    """The ``.metadata`` side file disappears once attributes are gone.

    First writes metadata for ``self.data`` with one attribute and
    expects the side file to exist; then removes the attribute, writes
    again and expects the side file to be absent.
    """
    scratch_dir = tempfile.mkdtemp()
    try:
        self.data.attributes["a"] = "aa"
        target = path.join(scratch_dir, "out.tab")
        sidecar = target + ".metadata"

        TabReader.write_table_metadata(target, self.data)
        self.assertTrue(path.isfile(sidecar))

        del self.data.attributes["a"]
        TabReader.write_table_metadata(target, self.data)
        self.assertFalse(path.isfile(sidecar))
    finally:
        shutil.rmtree(scratch_dir)
def test_metadata(self):
    """A table with attributes gets a ``.metadata`` file next to it."""
    scratch_dir = tempfile.mkdtemp()
    try:
        titanic = Table("titanic")
        titanic.attributes = OrderedDict()
        # Insert one key at a time, in this order.
        for key, value in (("a", "aa"), ("b", "bb")):
            titanic.attributes[key] = value

        target = path.join(scratch_dir, "out.tab")
        TabReader.write_table_metadata(target, titanic)

        sidecar = target + ".metadata"
        self.assertTrue(path.isfile(sidecar))
    finally:
        shutil.rmtree(scratch_dir)
def table_from_html(self, html):
    """Convert the last ``<table>`` found in *html* into a data table.

    The table header and body are re-serialised as tab-separated text in
    memory and handed to ``TabReader``; the result's ``name`` is taken
    from the first ``<h2>`` in the body (the part after the first
    ``': '``, or the whole heading when there is none).

    Raises ``DataEmptyError`` when *html* has no ``<table>`` and
    ``DataIsAnalError`` when it contains ``'<h2>Anal'`` or
    ``'div_analiza_'``.
    """
    soup = BeautifulSoup(html, 'html.parser')
    all_tables = soup.find_all('table')
    try:
        html_table = all_tables[-1]
    except IndexError:
        raise DataEmptyError
    if any(marker in html for marker in ('<h2>Anal', 'div_analiza_')):
        raise DataIsAnalError

    def _header_row_strings(row):
        # Yield each titled header cell's text once per spanned column.
        selector = 'thead tr:nth-of-type(%d) th[title]' % row
        for th in html_table.select(selector):
            yield from repeat(th.get_text(), int(th.get('colspan') or 1))

    def _cell_text(td):
        # If no span, feature is a number or a text field
        if td.span is None:
            return td.get_text()
        # If have span, it's a number, but if negative, replace with NaN
        if td.contents[0].strip().startswith('-'):
            return ''
        # Else if span, the number is its code, but we want its value
        return td.span.get_text()[1:-1]

    # self.DATETIME_VAR (available when Paradata is enabled in 1ka UI)
    # should match this variable name format
    header = []
    for th1, th3 in zip(_header_row_strings(1), _header_row_strings(3)):
        column_name = th1.rstrip(':')
        if th3 != th1:
            column_name += ' ({})'.format(th3.rstrip(':'))
        header.append(column_name)

    values = []
    for tr in html_table.select('tbody tr'):
        row_cells = [_cell_text(td)
                     for td in tr.select('td')
                     if 'data_uid' not in td.get('class', ())]
        values.append(row_cells)

    # Save parsed values into in-mem file for default values processing
    buffer = StringIO()
    writer = csv.writer(buffer, delimiter='\t')
    writer.writerow(header)
    writer.writerows(values)
    buffer.flush()
    buffer.seek(0)

    data = TabReader(buffer).read()

    first_heading = soup.select('body h2:nth-of-type(1)')[0]
    heading_parts = first_heading.get_text().split(': ', maxsplit=1)
    data.name = heading_parts[-1]
    return data
def test_read_and_save_attributes(self):
    """Variable attributes must survive tab-file write/read round trips.

    Covers the attributes parsed from the sample header, and additionally
    a path-like attribute value added after the first round trip.
    """
    samplefile = """\
    Feature 1\tFeature 2\tClass 1\tClass 42
    d \tM F \td \td
    \ta=1 b=2 \tclass x=a\\ longer\\ string \tclass
    1.0 \tM \t5 \trich
    """

    def verify(tab):
        # Run the shared expectations and hand back the first class
        # variable for further use.
        _, feature, klass, _ = tab.domain.variables
        self.assertIsInstance(feature, DiscreteVariable)
        self.assertEqual(feature.name, "Feature 2")
        self.assertEqual(feature.attributes, {"a": 1, "b": 2})
        self.assertIn(klass, tab.domain.class_vars)
        self.assertIsInstance(klass, DiscreteVariable)
        self.assertEqual(klass.name, "Class 1")
        self.assertEqual(klass.attributes, {"x": "a longer string"})
        return klass

    def unclosable_buffer():
        # A StringIO whose close() is a no-op, so its content can still
        # be read back after TabReader.write_file is done with it.
        buf = io.StringIO()
        buf.close = lambda: None
        return buf

    first = read_tab_file(io.StringIO(samplefile))
    verify(first)

    first_out = unclosable_buffer()
    TabReader.write_file(first_out, first)
    second = read_tab_file(io.StringIO(first_out.getvalue()))
    class_var = verify(second)

    location = "/path/to/somewhere"
    class_var.attributes["path"] = location
    second_out = unclosable_buffer()
    TabReader.write_file(second_out, second)
    second_out.seek(0)
    third = read_tab_file(second_out)
    _, _, reread_class, _ = third.domain.variables
    self.assertEqual(reread_class.attributes["path"], location)
def test_many_discrete(self):
    """Reading a column with 30000 distinct values must not be slow.

    Builds an in-memory tab file with a single column named "Poser",
    type row ``d`` and 30000 unique values ``K0`` .. ``K29999``, and
    fails if ``TabReader(...).read()`` takes longer than two seconds.

    Raises:
        AssertionError: if reading took more than two seconds.
    """
    b = io.StringIO()
    b.write("Poser\nd\n\n")
    b.writelines("K" + str(i) + "\n" for i in range(30000))
    # Rewind before reading so the timing covers the whole content and
    # does not rely on the reader rewinding the stream by itself.
    b.seek(0)

    # perf_counter() is monotonic and high-resolution; time.time() can
    # jump when the system clock is adjusted and skew the measurement.
    start = time.perf_counter()
    TabReader(b).read()
    elapsed = time.perf_counter() - start
    if elapsed > 2:
        raise AssertionError(
            "reading 30000 discrete values took %.2f s (limit: 2 s)"
            % elapsed)
def table_from_html(self, html):
    """Build a data table from the last ``<table>`` element in *html*.

    Header rows 1 and 3 of ``<thead>`` are combined into column names,
    body cells are reduced to plain strings, and the result is written
    as tab-separated text to an in-memory buffer that ``TabReader``
    then reads. The returned table's ``name`` comes from the first
    ``<h2>`` inside ``<body>``.

    Raises ``DataEmptyError`` if no ``<table>`` is present, and
    ``DataIsAnalError`` if *html* contains ``'<h2>Anal'`` or
    ``'div_analiza_'``.
    """
    soup = BeautifulSoup(html, 'html.parser')
    try:
        html_table = soup.find_all('table')[-1]
    except IndexError:
        raise DataEmptyError
    marker_found = '<h2>Anal' in html or 'div_analiza_' in html
    if marker_found:
        raise DataIsAnalError

    def _header_row_strings(row):
        # Text of every titled header cell, repeated `colspan` times.
        cells = html_table.select(
            'thead tr:nth-of-type(%d) th[title]' % row)
        return chain.from_iterable(
            repeat(cell.get_text(), int(cell.get('colspan') or 1))
            for cell in cells)

    def _column_name(top, bottom):
        # self.DATETIME_VAR (available when Paradata is enabled in 1ka UI)
        # should match this variable name format
        suffix = '' if bottom == top else ' ({})'
        return top.rstrip(':') + suffix.format(bottom.rstrip(':'))

    def _cell_value(td):
        if td.span is not None:
            # If have span, it's a number, but if negative, replace with NaN
            if td.contents[0].strip().startswith('-'):
                return ''
            # Else if span, the number is its code, but we want its value
            return td.span.get_text()[1:-1]
        # If no span, feature is a number or a text field
        return td.get_text()

    header = [_column_name(th1, th3)
              for th1, th3 in zip(_header_row_strings(1),
                                  _header_row_strings(3))]
    values = [[_cell_value(td)
               for td in tr.select('td')
               if 'data_uid' not in td.get('class', ())]
              for tr in html_table.select('tbody tr')]

    # Save parsed values into in-mem file for default values processing
    buffer = StringIO()
    tsv = csv.writer(buffer, delimiter='\t')
    tsv.writerow(header)
    tsv.writerows(values)
    buffer.flush()
    buffer.seek(0)

    data = TabReader(buffer).read()

    heading = soup.select('body h2:nth-of-type(1)')[0].get_text()
    # Keep what follows the first ': ', or the whole heading if absent.
    _, separator, remainder = heading.partition(': ')
    data.name = remainder if separator else heading
    return data
def test_bad_data(self):
    """
    Send the widget data whose domain differs from the predictor's.

    First a tree is trained with TreeLearner on a small table. Then the
    tree and a second table are sent to the Predictions widget. The
    second table is similar to the first, but its target has three
    different values instead of two.
    GH-2129
    """
    filestr1 = """\
    age\tsex\tsurvived
    d\td\td
    \t\tclass
    adult\tmale\tyes
    adult\tfemale\tno
    child\tmale\tyes
    child\tfemale\tyes
    """
    filestr2 = """\
    age\tsex\tsurvived
    d\td\td
    \t\tclass
    adult\tmale\tyes
    adult\tfemale\tno
    child\tmale\tyes
    child\tfemale\tunknown
    """

    Variable._clear_all_caches()
    try:
        file1 = io.StringIO(filestr1)
        table = TabReader(file1).read()
        learner = TreeLearner()
        tree = learner(table)

        file2 = io.StringIO(filestr2)
        bad_table = TabReader(file2).read()

        self.send_signal(self.widget.Inputs.predictors, tree, 1)

        with excepthook_catch():
            self.send_signal(self.widget.Inputs.data, bad_table)
    finally:
        # Reset the caches even when the test fails, so that tests
        # expecting the standard titanic data still work.
        Variable._clear_all_caches()
def test_read_save_quoted(self):
    """Quoted meta values keep their quotes (and inner tab) when saved.

    The first meta column is checked right after parsing and again
    after writing the table out and reading it back.
    """
    quoted = '''\
    S\tA
    s\td
    m\t
    """a"""\ti
    """b"""\tj
    """c\td"""\tk
    '''
    expected = ['"a"', '"b"', '"c\td"']

    def first_meta_column(tab):
        return tab.metas[:, 0].tolist()

    original = read_tab_file(io.StringIO(quoted))
    self.assertSequenceEqual(first_meta_column(original), expected)

    sink = io.StringIO()
    # close() becomes a no-op so the buffer can be read after writing.
    sink.close = lambda: None
    TabReader.write_file(sink, original)
    written = sink.getvalue()

    round_tripped = read_tab_file(io.StringIO(written))
    self.assertSequenceEqual(first_meta_column(round_tripped), expected)
def test_sheets(self):
    """A ``TabReader`` reports an empty tuple of sheets."""
    # One character per line: "x", "d", " ", "d", "b", "a", "c".
    content = "\n".join("xd dbac")
    tab_reader = TabReader(io.StringIO(content))
    self.assertEqual(tab_reader.sheets, ())
def read_tab_file(filename):
    """Return what ``TabReader(filename).read()`` produces.

    *filename* is passed to ``TabReader`` unchanged; callers in this
    file pass in-memory text streams as well.
    """
    reader = TabReader(filename)
    return reader.read()
def test_data_name(self):
    """Tables are named 'iris' whether loaded by name or via TabReader."""
    by_name = Table('iris')
    by_reader = TabReader(by_name.__file__).read()
    for loaded in (by_name, by_reader):
        self.assertEqual(loaded.name, 'iris')