def test_make_df_validate():
    """Columns with a header-validation mapping parse the table minus that row."""
    html_table = parser.HTMLTableParser(HTML, 1)
    checked_columns = []
    for idx in range(5):
        checked_columns.append(
            parser.DataColumn(idx, {0: RESULT1[0][idx]}, parser.div_parser)
        )
    expected = pd.DataFrame(DF_DATA[1:])
    assert expected.equals(html_table.make_df(checked_columns, 1))
def test_get_formatted_data():
    """Without validation every row is formatted into a record dict."""
    table = parser.HTMLTableParser(HTML, 1)
    names = [f"col_{num}" for num in range(5)]
    columns = [
        parser.DataColumn(name, num, {}, lambda x: x)
        for num, name in enumerate(names)
    ]
    expected = pd.DataFrame(RESULT1)
    expected.columns = names
    assert table.get_formatted_data(columns) == expected.to_dict("records")
def test_get_formatted_data_validate():
    """Validated columns drop the checked header row from the records."""
    table = parser.HTMLTableParser(HTML, 1)
    names = [f"col_{num}" for num in range(5)]
    columns = [
        parser.DataColumn(name, num, {0: RESULT1[0][num]}, parser.div_parser)
        for num, name in enumerate(names)
    ]
    expected = pd.DataFrame(DF_DATA[1:])
    expected.columns = names
    assert table.get_formatted_data(columns, 1) == expected.to_dict("records")
def test_get_formatted_data_with_parsed_data():
    """Parser functions are applied to every cell when formatting records."""
    table = parser.HTMLTableParser(HTML, 1)
    names = [f"col_{num}" for num in range(5)]
    columns = [
        parser.DataColumn(name, num, {}, parser.div_parser)
        for num, name in enumerate(names)
    ]
    expected = pd.DataFrame(DF_DATA)
    expected.columns = names
    assert table.get_formatted_data(columns) == expected.to_dict("records")
async def _download(self, name: str):
    """Fetch the dividends table for *name* and return it as a Series.

    Payouts sharing one date are summed; the result is indexed and
    sorted by date.
    """
    html = await get_html(name)
    table = parser.HTMLTableParser(html, TABLE_INDEX)
    # Common and preferred shares keep their dividends in different columns.
    ticker_column = COMMON_COLUMN if is_common(name) else PREFERRED_COLUMN
    df = table.make_df([DATE_COLUMN, ticker_column], HEADER_SIZE)
    df.dropna(inplace=True)
    df.columns = [DATE, name]
    df = df.groupby(DATE, as_index=False).sum()
    df.set_index(DATE, inplace=True)
    df.sort_index(inplace=True)
    return df[name]
def _download(self, item: str, last_index: Optional[Any]) -> List[Dict[str, Any]]:
    """Download and parse the aggregate dividends table from smart-lab.

    :param item: Requested item - only SMART_LAB is supported.
    :param last_index: Unused; kept for interface compatibility.
    :return: Formatted rows of ticker, date and dividend columns.
    :raises POptimizerError: If *item* is not SMART_LAB or the page
        cannot be downloaded.
    """
    if item != SMART_LAB:
        raise POptimizerError(
            f"Отсутствуют данные {self._mongo.collection.full_name}.{item}"
        )
    with self._session.get(URL) as respond:
        try:
            respond.raise_for_status()
        except requests.HTTPError as err:
            # Chain the original HTTP error so the cause is not lost.
            raise POptimizerError(f"Данные {URL} не загружены") from err
        html = respond.text
    table = parser.HTMLTableParser(html, TABLE_INDEX)
    columns = [TICKER_COLUMN, DATE_COLUMN, DIVIDENDS_COLUMN]
    return table.get_formatted_data(columns, HEADER_SIZE, FOOTER_SIZE)
def _download(self, item: str, last_index: Optional[Any]) -> List[Dict[str, Any]]:
    """Download and parse dividends for one ticker from dohod.ru.

    :param item: Ticker whose dividend history is requested.
    :param last_index: Unused; kept for interface compatibility.
    :return: Rows sorted and grouped by payout date.
    :raises POptimizerError: If the page cannot be downloaded.
    """
    url = f"https://www.dohod.ru/ik/analytics/dividend/{item.lower()}"
    with self._session.get(url) as respond:
        try:
            respond.raise_for_status()
        except requests.HTTPError as err:
            # Chain the original HTTP error so the cause is not lost.
            raise POptimizerError(f"Данные {url} не загружены") from err
        html = respond.text
    table = parser.HTMLTableParser(html, TABLE_INDEX)
    date_col = parser.DataColumn(
        DATE, 0, {0: "Дата закрытия реестра"}, parser.date_parser
    )
    div_col = parser.DataColumn(item, 2, {0: "Дивиденд (руб.)"}, parser.div_parser)
    data = table.get_formatted_data([date_col, div_col], HEADER_SIZE)
    return sort_and_group(item, data)
async def _download(self, name: str):
    """Download dividends for *name* from dohod.ru as a date-indexed Series.

    :raises POptimizerError: If the page cannot be downloaded.
    """
    url = f"https://www.dohod.ru/ik/analytics/dividend/{name.lower()}"
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as resp:
            try:
                resp.raise_for_status()
            except aiohttp.ClientResponseError as err:
                # Chain the original HTTP error so the cause is not lost.
                raise POptimizerError(f"Данные {url} не загружены") from err
            html = await resp.text()
    table = parser.HTMLTableParser(html, TABLE_INDEX)
    df = table.make_df([DATE_COLUMN, DIVIDENDS_COLUMN], HEADER_SIZE)
    df.columns = [DATE, name]
    # Several payouts on one date are summed into a single record.
    df = df.groupby(DATE, as_index=False).sum()
    df.set_index(DATE, inplace=True)
    df.sort_index(inplace=True)
    return df[name]
def _download(self, item: str, last_index: Optional[Any]) -> List[Dict[str, Any]]:
    """Download dividends for *item* from conomy, retrying on timeouts.

    Rows without a valid payout date are discarded before grouping.
    """
    while True:
        try:
            html = asyncio.run(get_html(item))
            break
        except asyncio.TimeoutError:
            # The page is rendered by a browser and may time out - retry.
            pass
    table = parser.HTMLTableParser(html, TABLE_INDEX)
    date_column = parser.DataColumn(
        DATE,
        5,
        {1: "Дата закрытия реестра акционеров", 2: "Под выплату дивидендов"},
        parser.date_parser,
    )
    # Common and preferred shares keep dividends in different table columns.
    if is_common(item):
        div_column = parser.DataColumn(
            item, 7, {1: "Размер дивидендов", 2: "АОИ"}, parser.div_parser
        )
    else:
        div_column = parser.DataColumn(
            item, 8, {1: "Размер дивидендов", 2: "АПИ"}, parser.div_parser
        )
    data = table.get_formatted_data([date_column, div_column], HEADER_SIZE)
    data = [row for row in data if row[DATE] is not None]
    return dohod.sort_and_group(item, data)
def test_get_formatted_data_raise_validate():
    """A mismatched validation cell raises with a descriptive message."""
    table = parser.HTMLTableParser(HTML, 1)
    columns = [parser.DataColumn("col", 1, {0: "2.2", 1: "test"}, lambda x: x)]
    with pytest.raises(POptimizerError) as error:
        table.get_formatted_data(columns)
    # Compare the message text - an exception instance never equals a str,
    # so the original `error.value == ...` assertion could not pass.
    assert str(error.value) == 'Значение в таблице "5.55 (сов)" - должно быть "test"'
def test_make_df_fail_validate():
    """make_df raises when a validation cell does not match the table."""
    table = parser.HTMLTableParser(HTML, 1)
    columns = [parser.DataColumn(1, {0: "2.2", 1: "test"}, lambda x: x)]
    with pytest.raises(POptimizerError) as error:
        table.make_df(columns)
    # Compare the message text - an exception instance never equals a str,
    # so the original `error.value == ...` assertion could not pass.
    assert str(error.value) == 'Значение в таблице "5.55 (сов)" - должно быть "test"'
def test_make_df_drop():
    """Header and footer rows are dropped when both sizes are given."""
    table = parser.HTMLTableParser(HTML, 1)
    columns = [parser.DataColumn(num, {}, parser.div_parser) for num in range(5)]
    result = table.make_df(columns, 1, 1)
    assert pd.DataFrame(DF_DATA[1:2]).equals(result)
def test_fast_second_parse():
    """A repeated access to parsed_table yields the same cached data."""
    table = parser.HTMLTableParser(HTML, 1)
    first = table.parsed_table
    second = table.parsed_table
    assert first == RESULT1
    assert second == RESULT1
def test_no_table():
    """Requesting a table index beyond the page raises a clear error."""
    with pytest.raises(POptimizerError) as error:
        parser.HTMLTableParser(HTML, 2)
    # Compare the message text - an exception instance never equals a str,
    # so the original `error.value == ...` assertion could not pass.
    assert str(error.value) == "На странице нет таблицы 2"
def test_parse_no_tbody():
    """Tables without a tbody element are still parsed correctly."""
    parsed = parser.HTMLTableParser(HTML, 1).parsed_table
    assert parsed == RESULT1
def test_make_df_with_parsed_data():
    """Cell parsers are applied to every column when building the frame."""
    table = parser.HTMLTableParser(HTML, 1)
    parsed_columns = []
    for num in range(5):
        parsed_columns.append(parser.DataColumn(num, {}, parser.div_parser))
    assert pd.DataFrame(DF_DATA).equals(table.make_df(parsed_columns))
def test_make_df():
    """With identity parsers the frame mirrors the raw parsed table."""
    table = parser.HTMLTableParser(HTML, 1)
    identity_columns = [
        parser.DataColumn(num, {}, lambda x: x) for num in range(5)
    ]
    result = table.make_df(identity_columns)
    assert pd.DataFrame(RESULT1).equals(result)