def test_preserve_html_None(self):
    # `preserve_html=True` must keep the markup inside data cells, while
    # `preserve_html=False` must reduce the same cells to their text.
    # The header row becomes the field names (f1, f2, f3) in both cases.
    html = dedent('''
        <html>
          <body>
            <table>
              <tr>
                <td><b>f1</b></td>
                <td>f2</td>
                <td>f3</td>
              </tr>
              <tr>
                <td><i>r0f1</i></td>
                <td><i>r0f2</i></td>
                <td><i>r0f3</i></td>
              </tr>
            </table>
          </body>
        </html>
    ''').encode('utf-8')

    table = rows.import_from_html(BytesIO(html), encoding='utf-8',
                                  preserve_html=True)
    self.assertEqual(table[0].f1, '<i>r0f1</i>')
    self.assertEqual(table[0].f2, '<i>r0f2</i>')
    self.assertEqual(table[0].f3, '<i>r0f3</i>')

    # The second import used to be created and then ignored; verify it too.
    table2 = rows.import_from_html(BytesIO(html), encoding='utf-8',
                                   preserve_html=False)
    self.assertEqual(table2[0].f1, 'r0f1')
    self.assertEqual(table2[0].f2, 'r0f2')
    self.assertEqual(table2[0].f3, 'r0f3')
def test_preserve_html_None(self):
    # `preserve_html=True` must keep the markup inside data cells, while
    # `preserve_html=False` must reduce the same cells to their text.
    # The header row becomes the field names (f1, f2, f3) in both cases.
    html = dedent(
        """
        <html>
          <body>
            <table>
              <tr>
                <td><b>f1</b></td>
                <td>f2</td>
                <td>f3</td>
              </tr>
              <tr>
                <td><i>r0f1</i></td>
                <td><i>r0f2</i></td>
                <td><i>r0f3</i></td>
              </tr>
            </table>
          </body>
        </html>
        """
    ).encode("utf-8")

    table = rows.import_from_html(
        BytesIO(html), encoding="utf-8", preserve_html=True
    )
    self.assertEqual(table[0].f1, "<i>r0f1</i>")
    self.assertEqual(table[0].f2, "<i>r0f2</i>")
    self.assertEqual(table[0].f3, "<i>r0f3</i>")

    # The second import used to be created and then ignored; verify it too.
    table2 = rows.import_from_html(
        BytesIO(html), encoding="utf-8", preserve_html=False
    )
    self.assertEqual(table2[0].f1, "r0f1")
    self.assertEqual(table2[0].f2, "r0f2")
    self.assertEqual(table2[0].f3, "r0f3")
def extract_data():
    """Yield one cleaned dict per data row of the two tables found at `url`.

    The page is downloaded once (TLS verification is disabled) and its first
    two HTML tables are parsed with colspans honoured. Every cell goes
    through `clear_text`. A row whose `codigo` contains ". " is treated as a
    category heading: it is not yielded, but its title-cased text is split at
    the first ". " into `codigo_categoria` and `categoria`, which are attached
    to every following row. For regular rows `codigo` is converted to `int`
    (dashes removed) and `qualificacao` becomes a list of stripped items
    (" ou " is treated like a comma).
    """
    url = "https://www.receita.fazenda.gov.br/pessoajuridica/cnpj/tabelas/natjurqualificaresponsavel.htm"
    response = requests.get(url, verify=False)

    def load_table(position):
        # Each table gets its own buffer over the same downloaded payload.
        return rows.import_from_html(
            io.BytesIO(response.content),
            encoding=response.encoding,
            index=position,
            ignore_colspan=False,
        )

    first_table = load_table(0)
    second_table = load_table(1)

    categoria = codigo_categoria = None
    for raw_row in chain(first_table, second_table):
        record = {
            name: clear_text(cell) for name, cell in raw_row._asdict().items()
        }
        codigo = record["codigo"]

        if ". " in codigo:
            # Heading row: remember the category for the rows that follow.
            codigo_categoria, _, categoria = codigo.title().partition(". ")
            continue

        record["codigo"] = int(codigo.replace("-", ""))
        record["categoria"] = categoria
        record["codigo_categoria"] = codigo_categoria
        pieces = record["qualificacao"].replace(" ou ", ", ").split(",")
        record["qualificacao"] = [piece.strip() for piece in pieces]
        yield record
def retrieve_legislators(self, url):
    """Download `url` and return its HTML table with cell markup preserved.

    The page is fetched through `BaseCollector.retrieve_uri` (no
    post-processing, UTF-8 forced), re-encoded as UTF-8 bytes and handed to
    `rows.import_from_html` with `preserve_html=True`.
    """
    page = BaseCollector.retrieve_uri(
        self, url, post_process=False, force_encoding='utf-8'
    )
    payload = BytesIO(page.encode('utf-8'))
    return rows.import_from_html(payload, preserve_html=True)
def test_ignore_colspan(self):
    # With `ignore_colspan=True` the colspan header row is skipped and the
    # table imports normally; with `ignore_colspan=False` the import must
    # fail because rows have a different number of fields.
    filename = 'tests/data/colspan-table.html'

    # Open in binary mode (as the other tests here do) and close the handle
    # deterministically instead of leaking it.
    with open(filename, mode='rb') as fobj:
        table = rows.import_from_html(fobj, ignore_colspan=True)
    self.assertEqual(set(table.fields.keys()), set(['field1', 'field2']))
    self.assertEqual(len(table), 2)
    self.assertEqual(table[0].field1, 'row1field1')
    self.assertEqual(table[0].field2, 'row1field2')
    self.assertEqual(table[1].field1, 'row2field1')
    self.assertEqual(table[1].field2, 'row2field2')

    with open(filename, mode='rb') as fobj:
        with self.assertRaises(ValueError) as raises:
            rows.import_from_html(fobj, ignore_colspan=False)
    # `exception.message` exists only on Python 2; `args[0]` works everywhere.
    self.assertEqual(raises.exception.args[0], 'Number of fields differ')
def parse_budget(self, year, action):
    """Parse the budget table of the current browser page into a rows table.

    The table at index 10 of `self.browser.html` is imported with all money
    columns forced to `BRDecimalField`. Rows whose `elemento` is "TOTAL" are
    dropped; every remaining row is tagged with `ano` (`year`),
    `codigo_acao` (`action`) and the fixed `estado` "SP". The result is
    rebuilt with `rows.import_from_dicts`.
    """
    money_columns = (
        "dotacao_inicial",
        "dotacao_atual",
        "empenhado",
        "liquidado",
        "pago",
        "pago_restos",
    )
    page = io.BytesIO(self.browser.html.encode("utf-8"))
    table = rows.import_from_html(
        page,
        index=10,
        force_types={column: BRDecimalField for column in money_columns},
    )

    records = []
    for entry in table:
        if entry.elemento != "TOTAL":
            record = entry._asdict()
            record["ano"] = year
            record["codigo_acao"] = action
            record["estado"] = "SP"
            records.append(record)
    return rows.import_from_dicts(records)
def test_ignore_colspan(self):
    # With `ignore_colspan=True` the colspan header row is skipped and the
    # table imports normally; with `ignore_colspan=False` the import must
    # fail because rows have a different number of fields.
    filename = 'tests/data/colspan-table.html'

    # Context managers close the fixture handles that used to be leaked.
    with open(filename, mode='rb') as fobj:
        table = rows.import_from_html(fobj, ignore_colspan=True)
    self.assertEqual(set(table.fields.keys()), set(['field1', 'field2']))
    self.assertEqual(len(table), 2)
    self.assertEqual(table[0].field1, 'row1field1')
    self.assertEqual(table[0].field2, 'row1field2')
    self.assertEqual(table[1].field1, 'row2field1')
    self.assertEqual(table[1].field2, 'row2field2')

    with open(filename, mode='rb') as fobj:
        with self.assertRaises(ValueError) as raises:
            rows.import_from_html(fobj, ignore_colspan=False)
    self.assertEqual(raises.exception.args[0], 'Number of fields differ')
def download_years():
    """Return a list with the `year` value of every game row at `URL_YEARS`.

    The page body is downloaded with `requests` and parsed as a UTF-8 HTML
    table by `rows.import_from_html`.
    """
    payload = BytesIO(requests.get(URL_YEARS).content)
    listing = rows.import_from_html(payload, encoding='utf-8')
    return [entry.year for entry in listing]
def test_extract_properties(self):
    # `properties=True` must add a JSON `properties` column holding the
    # attributes of each row.
    filename = 'tests/data/properties-table.html'

    # Open in binary mode (as the other tests here do) and close the handle
    # deterministically instead of leaking it.
    with open(filename, mode='rb') as fobj:
        table = rows.import_from_html(fobj, properties=True)

    # Wrap in `list(...)`: on Python 3 dict views never compare equal to
    # lists, so the bare comparison could only pass on Python 2.
    self.assertEqual(list(table.fields.keys()),
                     ['field1', 'field2', 'properties'])
    self.assertEqual(list(table.fields.values()),
                     [rows.fields.TextField,
                      rows.fields.TextField,
                      rows.fields.JSONField])

    properties_1 = {
        'class': 'some-class another-class',
        'data-test': 'value',
    }
    properties_2 = {
        'class': 'css-class',
        'data-test': 'value2',
    }
    self.assertEqual(len(table), 2)
    self.assertEqual(table[0].field1, 'row1field1')
    self.assertEqual(table[0].field2, 'row1field2')
    self.assertEqual(table[0].properties, properties_1)
    self.assertEqual(table[1].field1, 'row2field1')
    self.assertEqual(table[1].field2, 'row2field2')
    self.assertEqual(table[1].properties, properties_2)
def convert(self):
    """Convert the submitted file or URL and return the exported buffer.

    Reads `convert_file`, `convert_url` and `type_to` from
    `self.cleaned_data`. An uploaded file is saved through
    `default_storage`, imported with the `rows.import_from_<extension>`
    function matching its extension, and the saved copy is always removed
    afterwards (even when importing or exporting fails). Without a file the
    content of `convert_url` is downloaded and imported as HTML with markup
    preserved. The data is then written by `rows.export_to_<type_to>` into a
    `StringIO.StringIO` buffer, which is returned.

    Raises:
        AttributeError: if `rows` has no importer/exporter for the requested
            type (raised by the `getattr` lookups).
    """
    convert_url = self.cleaned_data.get('convert_url')
    convert_file = self.cleaned_data.get('convert_file')
    type_to = self.cleaned_data.get('type_to')

    def export(data):
        # Single export path shared by the file and the URL branches.
        result = StringIO.StringIO()
        getattr(rows, 'export_to_%s' % type_to)(data, result)
        return result

    if convert_file:
        saved_name = default_storage.save(
            convert_file.name, ContentFile(convert_file.read())
        )
        path = os.path.join(settings.MEDIA_ROOT, saved_name)
        # NOTE(review): the importer is chosen from the uploaded file's
        # extension, i.e. from user-controlled input - consider validating
        # it against a list of supported formats.
        convert_type = convert_file.name.split('.')[-1]
        try:
            data = getattr(rows, 'import_from_%s' % convert_type)(path)
            return export(data)
        finally:
            # Do not leave the stored upload behind when conversion fails.
            os.unlink(path)

    source = BytesIO(requests.get(convert_url).content)
    data = rows.import_from_html(source, preserve_html=True)
    return export(data)
def test_nested_tables_outer(self):
    # Importing without `index` must return the outermost table.
    filename = "tests/data/nested-table.html"

    # The context manager closes the fixture handle that used to be leaked.
    with open(filename, mode="rb") as fobj:
        table = rows.import_from_html(fobj)

    self.assertEqual(
        set(table.fields.keys()), set(["t00r0c0", "t00r0c1", "t00r0c2"])
    )
    self.assertEqual(len(table), 3)

    self.assertEqual(table[0].t00r0c0, "t0,0r1c0")
    self.assertEqual(table[0].t00r0c1, "t0,0r1c1")
    self.assertEqual(table[0].t00r0c2, "t0,0r1c2")

    # if there are nested tables, the inner ones will be represented as
    # strings (each <td>...</td> element will return only one string, even
    # if there is a <table> inside it)
    inner_table = (
        "t0,1r0c0 t0,1r0c1 t0,1r1c0 t0,1r1c1 t0,1r2c0 "
        "t0,1r2c1 t0,2r0c0 t0,2r0c1 t0,2r1c0 t0,2r1c1 "
        "t0,1r3c1 t0,1r4c0 t0,1r4c1 t0,1r5c0 t0,1r5c1"
    )
    self.assertEqual(table[1].t00r0c0, "t0,0r2c0")
    self.assertEqual(table[1].t00r0c1, inner_table)
    self.assertEqual(table[1].t00r0c2, "t0,0r2c2")

    self.assertEqual(table[2].t00r0c0, "t0,0r3c0")
    self.assertEqual(table[2].t00r0c1, "t0,0r3c1")
    self.assertEqual(table[2].t00r0c2, "t0,0r3c2")
def test_import_from_html_filename(self):
    # Importing by filename must reproduce the reference table and record
    # the source filename and encoding in the table's metadata.
    imported = rows.import_from_html(self.filename, encoding=self.encoding)
    self.assert_table_equal(imported, utils.table)

    self.assertEqual(
        imported.meta,
        dict(imported_from='html',
             filename=self.filename,
             encoding=self.encoding),
    )
def test_export_to_html_filename(self):
    # TODO: may test file contents
    # Only the temporary file's *name* is needed, so release the handle
    # right away instead of leaking it for the rest of the test run.
    with tempfile.NamedTemporaryFile(delete=False) as temp:
        filename = temp.name
    self.files_to_delete.append(filename)

    # Round trip: export by filename, re-import and compare.
    rows.export_to_html(utils.table, filename)
    table = rows.import_from_html(filename)
    self.assert_table_equal(table, utils.table)
def test_import_from_html_fobj(self):
    # TODO: may test with codecs.open passing an encoding
    # Importing from an open file object must reproduce the reference table
    # and record where the data came from in the table's metadata.
    with open(self.filename) as source:
        imported = rows.import_from_html(source, encoding=self.encoding)
    self.assert_table_equal(imported, utils.table)

    self.assertEqual(
        imported.meta,
        dict(imported_from='html', filename=self.filename),
    )
def test_table_index(self):
    # `index` selects which <table> of the document is imported
    # (default: the first one).
    filename = "tests/data/simple-table.html"

    # The context manager closes the fixture handle that used to be leaked.
    with open(filename, mode="rb") as fobj:
        table_1 = rows.import_from_html(fobj)
        fobj.seek(0)  # rewind so the same handle can be parsed again
        table_2 = rows.import_from_html(fobj, index=1)

    self.assertEqual(set(table_1.fields.keys()), set(["t0r0c0", "t0r0c1"]))
    self.assertEqual(len(table_1), 1)
    self.assertEqual(table_1[0].t0r0c0, "t0r1c0")
    self.assertEqual(table_1[0].t0r0c1, "t0r1c1")

    self.assertEqual(set(table_2.fields.keys()), set(["t1r0c0", "t1r0c1"]))
    self.assertEqual(len(table_2), 2)
    self.assertEqual(table_2[0].t1r0c0, "t1r1c0")
    self.assertEqual(table_2[0].t1r0c1, "t1r1c1")
    self.assertEqual(table_2[1].t1r0c0, "t1r2c0")
    self.assertEqual(table_2[1].t1r0c1, "t1r2c1")
def test_table_index(self):
    # `index` selects which <table> of the document is imported
    # (default: the first one).
    filename = 'tests/data/simple-table.html'

    # The context manager closes the fixture handle that used to be leaked.
    with open(filename, mode='rb') as fobj:
        table_1 = rows.import_from_html(fobj)
        fobj.seek(0)  # rewind so the same handle can be parsed again
        table_2 = rows.import_from_html(fobj, index=1)

    self.assertEqual(set(table_1.fields.keys()), set(['t0r0c0', 't0r0c1']))
    self.assertEqual(len(table_1), 1)
    self.assertEqual(table_1[0].t0r0c0, 't0r1c0')
    self.assertEqual(table_1[0].t0r0c1, 't0r1c1')

    self.assertEqual(set(table_2.fields.keys()), set(['t1r0c0', 't1r0c1']))
    self.assertEqual(len(table_2), 2)
    self.assertEqual(table_2[0].t1r0c0, 't1r1c0')
    self.assertEqual(table_2[0].t1r0c1, 't1r1c1')
    self.assertEqual(table_2[1].t1r0c0, 't1r2c0')
    self.assertEqual(table_2[1].t1r0c1, 't1r2c1')
def test_table_index(self):
    # `index` selects which <table> of the document is imported
    # (default: the first one).
    filename = 'tests/data/simple-table.html'

    # Open in binary mode (as the other tests here do) and close the handle
    # deterministically instead of leaking it.
    with open(filename, mode='rb') as fobj:
        table_1 = rows.import_from_html(fobj)
        fobj.seek(0)  # rewind so the same handle can be parsed again
        table_2 = rows.import_from_html(fobj, index=1)

    self.assertEqual(set(table_1.fields.keys()), set(['t0r0c0', 't0r0c1']))
    self.assertEqual(len(table_1), 1)
    self.assertEqual(table_1[0].t0r0c0, 't0r1c0')
    self.assertEqual(table_1[0].t0r0c1, 't0r1c1')

    self.assertEqual(set(table_2.fields.keys()), set(['t1r0c0', 't1r0c1']))
    self.assertEqual(len(table_2), 2)
    self.assertEqual(table_2[0].t1r0c0, 't1r1c0')
    self.assertEqual(table_2[0].t1r0c1, 't1r1c1')
    self.assertEqual(table_2[1].t1r0c0, 't1r2c0')
    self.assertEqual(table_2[1].t1r0c1, 't1r2c1')
def test_export_to_html_fobj(self):
    # TODO: may test with codecs.open passing an encoding
    # TODO: may test file contents
    temp = tempfile.NamedTemporaryFile(delete=False, mode="wb")
    self.files_to_delete.append(temp.name)
    try:
        # Round trip: export into the open handle, re-import by name.
        rows.export_to_html(utils.table, temp.file)
        table = rows.import_from_html(temp.name)
    finally:
        # Release the handle even if the round trip fails; closing an
        # already-closed file is harmless.
        temp.close()
    self.assert_table_equal(table, utils.table)
def test_nested_tables_second_inner(self):
    # `index=2` must select the innermost of the nested tables.
    filename = "tests/data/nested-table.html"

    # The context manager closes the fixture handle that used to be leaked.
    with open(filename, mode="rb") as fobj:
        table = rows.import_from_html(fobj, index=2)

    self.assertEqual(set(table.fields.keys()), set(["t02r0c0", "t02r0c1"]))
    self.assertEqual(len(table), 1)
    self.assertEqual(table[0].t02r0c0, "t0,2r1c0")
    self.assertEqual(table[0].t02r0c1, "t0,2r1c1")
def test_import_from_html_filename(self):
    # Importing by filename must reproduce the reference table and record
    # the source filename and encoding in the table's metadata.
    imported = rows.import_from_html(self.filename, encoding=self.encoding)
    self.assert_table_equal(imported, utils.table)

    self.assertEqual(
        imported.meta,
        dict(
            imported_from="html",
            filename=self.filename,
            encoding=self.encoding,
        ),
    )
def extract(self):
    """Yield every selected row of `self.filename` as a dict.

    The file is parsed as ISO-8859-1 HTML; only `<tr>` elements without a
    `bgcolor` attribute are used as rows, field definitions come from
    `self.fields`, and the first row is kept as data (`skip_header=False`).
    """
    import_options = {
        "encoding": "iso-8859-1",
        "row_tag": "//tr[not(@bgcolor)]",
        "fields": self.fields,
        "skip_header": False,
    }
    for record in rows.import_from_html(self.filename, **import_options):
        yield record._asdict()
def test_import_from_html_fobj(self):
    # TODO: may test with codecs.open passing an encoding
    # Importing from a binary file object must reproduce the reference table
    # and record filename and encoding in the table's metadata.
    with open(self.filename, mode='rb') as source:
        imported = rows.import_from_html(source, encoding=self.encoding)
    self.assert_table_equal(imported, utils.table)

    self.assertEqual(
        imported.meta,
        dict(imported_from='html',
             filename=self.filename,
             encoding=self.encoding),
    )
def test_nested_tables_second_inner(self):
    # `index=2` must select the innermost of the nested tables.
    filename = 'tests/data/nested-table.html'

    # The context manager closes the fixture handle that used to be leaked.
    with open(filename, mode='rb') as fobj:
        table = rows.import_from_html(fobj, index=2)

    self.assertEqual(set(table.fields.keys()), set(['t02r0c0', 't02r0c1']))
    self.assertEqual(len(table), 1)
    self.assertEqual(table[0].t02r0c0, 't0,2r1c0')
    self.assertEqual(table[0].t02r0c1, 't0,2r1c1')
def test_preserve_html_and_not_skip_header(self, mocked_create_table):
    # Inspects the data handed to the (mocked) `create_table` to check which
    # rows keep their HTML when `preserve_html=True`.
    filename = "tests/data/table-with-sections.html"

    # If `import_from_html` needs to identify field names, then it
    # should not preserve HTML inside first row
    rows.import_from_html(filename, index=1, preserve_html=True)
    call_args = mocked_create_table.call_args_list.pop()
    data = list(call_args[0][0])
    kwargs = call_args[1]

    self.assertIsNone(kwargs.get("fields"))
    self.assertEqual(len(data), 6)
    self.assertNotIn("<", data[0][1])
    self.assertNotIn(">", data[0][1])
    for row in data[1:]:
        # Second field has HTML
        self.assertIn("<", row[1])
        self.assertIn(">", row[1])

    # If we provide fields, ask to preserve HTML and do not skip the
    # header, then every row (including the first) must keep its HTML
    fields = OrderedDict(
        [
            ("first", rows.fields.TextField),
            ("second", rows.fields.TextField),
            ("third", rows.fields.TextField),
            ("fourth", rows.fields.TextField),
        ]
    )
    rows.import_from_html(
        filename, index=1, fields=fields, preserve_html=True, skip_header=False
    )
    call_args = mocked_create_table.call_args_list.pop()
    data = list(call_args[0][0])
    kwargs = call_args[1]

    self.assertEqual(kwargs.get("fields"), fields)
    self.assertEqual(len(data), 6)
    for row in data:
        # Second field has HTML and should not be stripped
        self.assertIn("<", row[1])
        self.assertIn(">", row[1])
def test_table_thead_tbody(self):
    # A table split into <thead>/<tbody> must import like a plain one.
    filename = "tests/data/table-thead-tbody.html"

    # The context manager closes the fixture handle that used to be leaked.
    with open(filename, mode="rb") as fobj:
        table = rows.import_from_html(fobj)

    self.assertEqual(set(table.fields.keys()), set(["t1", "t2"]))
    self.assertEqual(len(table), 2)
    self.assertEqual(table[0].t1, "456")
    self.assertEqual(table[0].t2, "123")
    self.assertEqual(table[1].t1, "qqq")
    self.assertEqual(table[1].t2, "aaa")
def test_issue_168(self):
    # A table with a JSON column must survive an HTML export/import round
    # trip unchanged.
    # The temporary file is used only to obtain a unique base name. Let it
    # be closed and removed immediately: previously it was created with
    # `delete=False`, never closed and never registered for cleanup.
    with tempfile.NamedTemporaryFile() as temp:
        filename = "{}.{}".format(temp.name, self.file_extension)
    self.files_to_delete.append(filename)

    table = rows.Table(fields=OrderedDict([("jsoncolumn", rows.fields.JSONField)]))
    table.append({"jsoncolumn": '{"python": 42}'})
    rows.export_to_html(table, filename)

    table2 = rows.import_from_html(filename)
    self.assert_table_equal(table, table2)
def test_table_thead_tbody(self):
    # A table split into <thead>/<tbody> must import like a plain one.
    filename = 'tests/data/table-thead-tbody.html'

    # The context manager closes the fixture handle that used to be leaked.
    with open(filename, mode='rb') as fobj:
        table = rows.import_from_html(fobj)

    self.assertEqual(set(table.fields.keys()), set(['t1', 't2']))
    self.assertEqual(len(table), 2)
    self.assertEqual(table[0].t1, '456')
    self.assertEqual(table[0].t2, '123')
    self.assertEqual(table[1].t1, 'qqq')
    self.assertEqual(table[1].t2, 'aaa')
def test_nested_tables_second_inner(self):
    # `index=2` must select the innermost of the nested tables.
    filename = 'tests/data/nested-table.html'

    # Open in binary mode (as the other tests here do) and close the handle
    # deterministically instead of leaking it.
    with open(filename, mode='rb') as fobj:
        table = rows.import_from_html(fobj, index=2)

    self.assertEqual(set(table.fields.keys()), set(['t02r0c0', 't02r0c1']))
    self.assertEqual(len(table), 1)
    self.assertEqual(table[0].t02r0c0, 't0,2r1c0')
    self.assertEqual(table[0].t02r0c1, 't0,2r1c1')
def test_table_thead_tbody(self):
    # A table split into <thead>/<tbody> must import like a plain one.
    filename = 'tests/data/table-thead-tbody.html'

    # Open in binary mode (as the other tests here do) and close the handle
    # deterministically instead of leaking it.
    with open(filename, mode='rb') as fobj:
        table = rows.import_from_html(fobj)

    self.assertEqual(set(table.fields.keys()), set(['t1', 't2']))
    self.assertEqual(len(table), 2)
    self.assertEqual(table[0].t1, '456')
    self.assertEqual(table[0].t2, '123')
    self.assertEqual(table[1].t1, 'qqq')
    self.assertEqual(table[1].t2, 'aaa')
def test_import_from_html_uses_create_table(self, mocked_create_table):
    # `import_from_html` must delegate to `create_table` exactly once,
    # forward the caller's keyword arguments plus a `meta` entry, and
    # return whatever `create_table` returns.
    sentinel = 42
    mocked_create_table.return_value = sentinel
    options = {'encoding': 'iso-8859-15', 'some_key': 123, 'other': 456}

    outcome = rows.import_from_html(self.filename, **options)

    self.assertTrue(mocked_create_table.called)
    self.assertEqual(mocked_create_table.call_count, 1)
    self.assertEqual(outcome, sentinel)

    expected_kwargs = dict(
        options,
        meta={'imported_from': 'html', 'filename': self.filename},
    )
    received_kwargs = mocked_create_table.call_args[1]
    self.assertEqual(received_kwargs, expected_kwargs)
def parse_process(self, response):
    """Attach the process' progress entries to the row carried by the request.

    The second table (`index=1`) of the response body is parsed and the
    `andamento` value of each of its rows is joined with "|" into
    `row['andamentos']`, where `row` is the mapping stored in
    `response.request.meta['row']`. That same mapping is returned.
    """
    row = response.request.meta['row']
    body = response.body_as_unicode()
    table = rows.import_from_html(
        io.BytesIO(body.encode('utf-8')), encoding='utf-8', index=1
    )
    # Use a distinct loop name: the original list comprehension reused
    # `row`, which on Python 2 leaks out of the comprehension and rebinds
    # `row` to the last table row before the assignment below.
    row['andamentos'] = '|'.join(entry.andamento for entry in table)
    return row
def test_import_from_html_fobj(self):
    # TODO: may test with codecs.open passing an encoding
    # Importing from a binary file object must reproduce the reference table
    # and record filename and encoding in the table's metadata.
    with open(self.filename, mode="rb") as source:
        imported = rows.import_from_html(source, encoding=self.encoding)
    self.assert_table_equal(imported, utils.table)

    self.assertEqual(
        imported.meta,
        dict(
            imported_from="html",
            filename=self.filename,
            encoding=self.encoding,
        ),
    )
def parse_movements(self, response):
    """Yield one dict per row of the response's second table.

    The table at `index=1` is parsed with its `data` column forced to
    `PtBrDateField`. Each yielded dict is additionally tagged with the
    `numero_processo` and `classe_processo` of the process stored in
    `response.request.meta['process']`.
    """
    process = response.request.meta['process']
    html = response.body_as_unicode().encode('utf-8')
    table = rows.import_from_html(
        io.BytesIO(html),
        encoding='utf-8',
        index=1,
        force_types={'data': PtBrDateField},
    )

    for entry in table:
        movement = dict(entry._asdict())
        for key in ('numero_processo', 'classe_processo'):
            movement[key] = process[key]
        yield movement
def test_ignore_colspan(self):
    # With `ignore_colspan=True` the colspan header row is skipped; with
    # `ignore_colspan=False` it becomes the header and the former header
    # row is imported as data.
    filename = "tests/data/colspan-table.html"

    # Context managers close the fixture handles that used to be leaked.
    with open(filename, mode="rb") as fobj:
        table = rows.import_from_html(fobj, ignore_colspan=True)
    self.assertEqual(set(table.fields.keys()), set(["field1", "field2"]))
    self.assertEqual(len(table), 2)
    self.assertEqual(table[0].field1, "row1field1")
    self.assertEqual(table[0].field2, "row1field2")
    self.assertEqual(table[1].field1, "row2field1")
    self.assertEqual(table[1].field2, "row2field2")

    with open(filename, mode="rb") as fobj:
        table = rows.import_from_html(fobj, ignore_colspan=False)
    # `assertEquals` is a deprecated alias (removed in Python 3.12).
    self.assertEqual(list(table.fields.keys()), ["huge_title", "field_1"])
    self.assertEqual(len(table), 3)
    expected_data = [
        ["field1", "field2"],
        ["row1field1", "row1field2"],
        ["row2field1", "row2field2"],
    ]
    for row_data, table_row in zip(expected_data, table):
        self.assertEqual(row_data, [table_row.huge_title, table_row.field_1])
def test_import_from_html_uses_create_table(self, mocked_create_table):
    # `import_from_html` must delegate to `create_table` exactly once,
    # forward unknown keyword arguments, move `encoding` into `meta`, and
    # return whatever `create_table` returns.
    sentinel = 42
    mocked_create_table.return_value = sentinel
    extra_options = {'some_key': 123, 'other': 456}

    outcome = rows.import_from_html(self.filename, encoding='iso-8859-1',
                                    **extra_options)

    self.assertTrue(mocked_create_table.called)
    self.assertEqual(mocked_create_table.call_count, 1)
    self.assertEqual(outcome, sentinel)

    expected_kwargs = dict(
        extra_options,
        meta={'imported_from': 'html',
              'filename': self.filename,
              'encoding': 'iso-8859-1'},
    )
    received_kwargs = mocked_create_table.call_args[1]
    self.assertEqual(received_kwargs, expected_kwargs)
def test_import_from_html_uses_create_table(self, mocked_create_table):
    # `import_from_html` must delegate to `create_table` exactly once,
    # forward unknown keyword arguments, move `encoding` into `meta`, and
    # return whatever `create_table` returns.
    sentinel = 42
    mocked_create_table.return_value = sentinel
    extra_options = {"some_key": 123, "other": 456}

    outcome = rows.import_from_html(
        self.filename, encoding="iso-8859-1", **extra_options
    )

    self.assertTrue(mocked_create_table.called)
    self.assertEqual(mocked_create_table.call_count, 1)
    self.assertEqual(outcome, sentinel)

    expected_kwargs = dict(
        extra_options,
        meta={
            "imported_from": "html",
            "filename": self.filename,
            "encoding": "iso-8859-1",
        },
    )
    received_kwargs = mocked_create_table.call_args[1]
    self.assertEqual(received_kwargs, expected_kwargs)
def search_router_database(query):
    """Search the router database for `query` and return the transformed table.

    Posts the search form to `URL_ROUTER_SEARCH`, parses the resulting HTML
    table together with each row's properties, and passes it through
    `rows.transform` with `transform_row`. The output fields are an integer
    `id` followed by those names from `FIELD_NAMES` that the parsed table
    actually has, keeping their detected types.
    """
    form = {'action': 'routerList', 'criteria': query, 'site': 'drupal'}
    response = requests.post(URL_ROUTER_SEARCH, data=form)
    table = rows.import_from_html(
        BytesIO(response.content),
        encoding=response.encoding,
        properties=True,
    )

    available = [
        (name, table.fields[name])
        for name in FIELD_NAMES
        if name in table.fields
    ]
    fields = OrderedDict([('id', rows.fields.IntegerField)] + available)
    return rows.transform(fields, transform_row, table)
def test_preserve_html(self):
    # With `preserve_html=True` a cell containing a nested table must keep
    # that table's markup.
    filename = 'tests/data/nested-table.html'

    # Open in binary mode (as the other tests here do) and close the handle
    # deterministically instead of leaking it.
    with open(filename, mode='rb') as fobj:
        table = rows.import_from_html(fobj, preserve_html=True)

    expected_data = [
        '<table>',
        '<tr>', '<td> t0,1r0c0 </td>', '<td> t0,1r0c1 </td>', '</tr>',
        '<tr>', '<td> t0,1r1c0 </td>', '<td> t0,1r1c1 </td>', '</tr>',
        '<tr>', '<td> t0,1r2c0 </td>', '<td> t0,1r2c1 </td>', '</tr>',
        '<tr>',
        '<td>',
        '<table>',
        '<tr>', '<td> t0,2r0c0 </td>', '<td> t0,2r0c1 </td>', '</tr>',
        '<tr>', '<td> t0,2r1c0 </td>', '<td> t0,2r1c1 </td>', '</tr>',
        '</table>',
        '</td>',
        '<td> t0,1r3c1 </td>',
        '</tr>',
        '<tr>', '<td> t0,1r4c0 </td>', '<td> t0,1r4c1 </td>', '</tr>',
        '<tr>', '<td> t0,1r5c0 </td>', '<td> t0,1r5c1 </td>', '</tr>',
        '</table>',
    ]
    self.assertEqual(cleanup_lines(table[1].t00r0c1), expected_data)
def test_preserve_html(self):
    # With `preserve_html=True` a cell containing a nested table must keep
    # that table's markup.
    filename = "tests/data/nested-table.html"

    # The context manager closes the fixture handle that used to be leaked.
    with open(filename, mode="rb") as fobj:
        table = rows.import_from_html(fobj, preserve_html=True)
    # TODO: test without passing encoding

    expected_data = [
        "<table>",
        "<tr>",
        "<td> t0,1r0c0 </td>",
        "<td> t0,1r0c1 </td>",
        "</tr>",
        "<tr>",
        "<td> t0,1r1c0 </td>",
        "<td> t0,1r1c1 </td>",
        "</tr>",
        "<tr>",
        "<td> t0,1r2c0 </td>",
        "<td> t0,1r2c1 </td>",
        "</tr>",
        "<tr>",
        "<td>",
        "<table>",
        "<tr>",
        "<td> t0,2r0c0 </td>",
        "<td> t0,2r0c1 </td>",
        "</tr>",
        "<tr>",
        "<td> t0,2r1c0 </td>",
        "<td> t0,2r1c1 </td>",
        "</tr>",
        "</table>",
        "</td>",
        "<td> t0,1r3c1 </td>",
        "</tr>",
        "<tr>",
        "<td> t0,1r4c0 </td>",
        "<td> t0,1r4c1 </td>",
        "</tr>",
        "<tr>",
        "<td> t0,1r5c0 </td>",
        "<td> t0,1r5c1 </td>",
        "</tr>",
        "</table>",
    ]
    self.assertEqual(cleanup_lines(table[1].t00r0c1), expected_data)
def test_extract_properties(self):
    # `properties=True` must add a JSON `properties` column holding the
    # attributes of each row.
    filename = "tests/data/properties-table.html"

    # The context manager closes the fixture handle that used to be leaked.
    with open(filename, mode="rb") as fobj:
        table = rows.import_from_html(fobj, properties=True)

    self.assertEqual(table.field_names, ["field1", "field2", "properties"])
    self.assertEqual(
        table.field_types,
        [rows.fields.TextField, rows.fields.TextField, rows.fields.JSONField],
    )

    properties_1 = {"class": "some-class another-class", "data-test": "value"}
    properties_2 = {"class": "css-class", "data-test": "value2"}
    self.assertEqual(len(table), 2)
    self.assertEqual(table[0].field1, "row1field1")
    self.assertEqual(table[0].field2, "row1field2")
    self.assertEqual(table[0].properties, properties_1)
    self.assertEqual(table[1].field1, "row2field1")
    self.assertEqual(table[1].field2, "row2field2")
    self.assertEqual(table[1].properties, properties_2)
def router_images(router_id):
    """Return the firmware image listing of router `router_id` as a table.

    The router's detail page (from `_router_detail`) is parsed with cell
    markup preserved and its second table (`index=1`) is transformed row by
    row: text is extracted from the `date`, `size` and `description` cells,
    while the `filename` cell is parsed as a tag to obtain the link text and
    an absolute, URL-quoted download `url` based on `URL_ROUTER_SEARCH`.
    """
    html = _router_detail(router_id)
    table = rows.import_from_html(BytesIO(html), index=1, preserve_html=True)

    fields = OrderedDict()
    fields['date'] = rows.fields.DateField
    fields['filename'] = rows.fields.TextField
    fields['url'] = rows.fields.TextField
    fields['size'] = rows.fields.TextField
    fields['description'] = rows.fields.TextField

    def transform(row, table):
        # The filename cell holds a link: split it into text and target.
        link = tag_to_dict(row.filename)
        download_url = url_join(URL_ROUTER_SEARCH, url_quote(link['href']))
        converted = {}
        converted['date'] = extract_text(row.date)
        converted['description'] = extract_text(row.description)
        converted['filename'] = link['text']
        converted['size'] = extract_text(row.size)
        converted['url'] = download_url
        return converted

    return rows.transform(fields, transform, table)
def test_extract_properties(self):
    # `properties=True` must add a JSON `properties` column holding the
    # attributes of each row.
    filename = 'tests/data/properties-table.html'

    # The context manager closes the fixture handle that used to be leaked.
    with open(filename, mode='rb') as fobj:
        table = rows.import_from_html(fobj, properties=True)

    self.assertEqual(table.field_names, ['field1', 'field2', 'properties'])
    self.assertEqual(table.field_types,
                     [rows.fields.TextField,
                      rows.fields.TextField,
                      rows.fields.JSONField])

    properties_1 = {'class': 'some-class another-class',
                    'data-test': 'value', }
    properties_2 = {'class': 'css-class', 'data-test': 'value2', }
    self.assertEqual(len(table), 2)
    self.assertEqual(table[0].field1, 'row1field1')
    self.assertEqual(table[0].field2, 'row1field2')
    self.assertEqual(table[0].properties, properties_1)
    self.assertEqual(table[1].field1, 'row2field1')
    self.assertEqual(table[1].field2, 'row2field2')
    self.assertEqual(table[1].properties, properties_2)
def test_nested_tables_first_inner(self):
    # `index=1` must select the first nested table; the table nested inside
    # it is flattened to a single text cell.
    filename = "tests/data/nested-table.html"

    # The context manager closes the fixture handle that used to be leaked.
    with open(filename, mode="rb") as fobj:
        table = rows.import_from_html(fobj, index=1)

    self.assertEqual(set(table.fields.keys()), set(["t01r0c0", "t01r0c1"]))
    self.assertEqual(len(table), 5)

    self.assertEqual(table[0].t01r0c0, "t0,1r1c0")
    self.assertEqual(table[0].t01r0c1, "t0,1r1c1")

    self.assertEqual(table[1].t01r0c0, "t0,1r2c0")
    self.assertEqual(table[1].t01r0c1, "t0,1r2c1")

    inner_table = "t0,2r0c0 t0,2r0c1 t0,2r1c0 t0,2r1c1"
    self.assertEqual(table[2].t01r0c0, inner_table)
    self.assertEqual(table[2].t01r0c1, "t0,1r3c1")

    self.assertEqual(table[3].t01r0c0, "t0,1r4c0")
    self.assertEqual(table[3].t01r0c1, "t0,1r4c1")

    self.assertEqual(table[4].t01r0c0, "t0,1r5c0")
    self.assertEqual(table[4].t01r0c1, "t0,1r5c1")
def test_nested_tables_outer(self):
    # Importing without `index` must return the outermost table; the cell
    # holding the nested tables is compared after `cleanup_lines`.
    filename = 'tests/data/nested-table.html'

    # Open in binary mode (as the other tests here do) and close the handle
    # deterministically instead of leaking it.
    with open(filename, mode='rb') as fobj:
        table = rows.import_from_html(fobj)

    self.assertEqual(set(table.fields.keys()),
                     set(['t00r0c0', 't00r0c1', 't00r0c2']))
    self.assertEqual(len(table), 3)

    self.assertEqual(table[0].t00r0c0, 't0,0r1c0')
    self.assertEqual(table[0].t00r0c1, 't0,0r1c1')
    self.assertEqual(table[0].t00r0c2, 't0,0r1c2')

    inner_table = ('t0,1r0c0 t0,1r0c1 t0,1r1c0 t0,1r1c1 t0,1r2c0 '
                   't0,1r2c1 t0,2r0c0 t0,2r0c1 t0,2r1c0 t0,2r1c1 '
                   't0,1r3c1 t0,1r4c0 t0,1r4c1 t0,1r5c0 t0,1r5c1').split()
    self.assertEqual(table[1].t00r0c0, 't0,0r2c0')
    self.assertEqual(cleanup_lines(table[1].t00r0c1), inner_table)
    self.assertEqual(table[1].t00r0c2, 't0,0r2c2')

    self.assertEqual(table[2].t00r0c0, 't0,0r3c0')
    self.assertEqual(table[2].t00r0c1, 't0,0r3c1')
    self.assertEqual(table[2].t00r0c2, 't0,0r3c2')