def get_result_task(count_page, url_task):
    """Collect the items of one task across all of its pagination pages.

    :param count_page: total number of pagination pages (int or numeric str)
    :param url_task: base search URL; ``&p=N`` is appended for each page
    :return: list of parsed item dicts accumulated over all pages
    """
    next_pagination = True
    result = []
    for page in range(1, int(count_page) + 1):
        # A previous page signalled that pagination must stop
        # (e.g. "other offers" block found) — do not fetch further pages.
        if not next_pagination:
            break
        log.info('Parsing page# ' + str(page) + ' of ' + str(count_page))
        page_url = url_task + '&p=' + str(page)
        try:
            page_data = get_page_data(page_url, 1)
        except Exception:  # was a bare except; keep the best-effort behaviour
            # On failure: report, contribute no items, but keep paginating.
            page_data = [], True
            error_message = 'Error get_page_data' + '\n ' + page_url
            text_handler(EXEPTION_CHAT, error_message)
            log.error(error_message)
        result += page_data[0]
        next_pagination = page_data[1]
        time_sleep()
    return result
def send_mes_to_bot(item):
    """Build the Telegram notification text for one offer and send it.

    :param item: dict produced by ``write_sqlite3`` with keys such as
        ``type_update`` ('update'/'new'), ``sql_avito_id``, ``sql_price``,
        ``old_price``, ``sql_type_of``, ``sql_chat`` etc.
    :return: None (the message is delivered via ``text_handler``)
    """
    from main import log
    first_row = ''   # ID / header line
    second_row = ''  # Name
    third_row = ''   # price
    fours_row = ''   # price_history
    five_row = ''    # address
    six_row = ''     # params
    seven_row = ''   # url
    if item['type_update'] == 'update':
        # Compare the price values themselves.  The previous code compared
        # the raw fetchall() structure (`item['item_price'] >= [(price,)]`),
        # which only worked by accident of list/tuple ordering; and its two
        # branches read the old price from two different places
        # (`old_price` vs `item_price[0][0]`) that hold the same value.
        if item['old_price'] >= item['sql_price']:
            # Price dropped (or stayed equal) — "good news" emojis.
            first_row = 'Обновилась цена id ' + str(item['sql_avito_id']) + ' ' + \
                emoji_down + emoji_down + emoji_top_green + '\n\n'
        else:
            # Price rose — "bad news" emojis.
            first_row = 'Обновилась цена id ' + str(item['sql_avito_id']) + ' ' + \
                emoji_top + emoji_top + emoji_down_red + '\n\n'
        third_row = 'Старая цена = ' + str(num_conversion(item['old_price'])) + ' руб. /' + \
            ' Новая цена = ' + str(num_conversion(item['sql_price'])) + ' руб.\n\n'
        fours_row = 'Изменения цен \n' + str(item['price_history_srt']) + '\nРазница: ' + \
            item['difference_price'] + ' (' + item['percent_difference_price'] + '%)\n\n'
    elif item['type_update'] == 'new':
        first_row = 'Новое объявление ' + str(item['sql_avito_id']) + '\n\n'
        third_row = 'Цена: ' + str(item['sql_price']) + ' руб.\n\n'
    else:
        log.error('type_update = NONETYPE ' + str(item['sql_avito_id']))
    second_row = str(item['sql_name']) + '\n\n'
    five_row = 'Адрес: ' + str(item['sql_address']) + '\n\n'
    six_row = 'Параметры: ' + str(item['sql_params']) + '\n\n'
    seven_row = 'Ссылка ' + str(item['sql_url']) + '\n\n'
    # Categories that share the "generic" message layout below.
    none_type_of = [
        'Личные вещи', 'Работа', 'Для дома и дачи', 'Предложение услуг',
        'Электроника', 'Животные', 'Готовый бизнес и оборудование'
    ]
    if item['sql_type_of'] == 'Недвижимость':
        mes_to_bot = first_row + third_row + fours_row + five_row + seven_row
    elif item['sql_type_of'] == 'Транспорт':
        mes_to_bot = first_row + second_row + third_row + fours_row + six_row + seven_row
    elif item['sql_type_of'] == 'Хобби и отдых':
        mes_to_bot = first_row + second_row + third_row + fours_row + five_row + six_row + seven_row
    elif item['sql_type_of'] in none_type_of:
        mes_to_bot = first_row + second_row + third_row + fours_row + seven_row
    else:
        log.error('sql_type_of = NONETYPE ' + str(item['sql_avito_id']))
        mes_to_bot = 'sql_type_of = NONETYPE ' + str(item['sql_avito_id'])
    text_handler(item['sql_chat'], mes_to_bot)
def get_soup_from_page(page_url, count_try):
    """Fetch a page and return ``(soup, next_parsing)``.

    Retries with increasing sleeps on HTTP 429 and other non-200 statuses;
    gives up after the retry budget is spent.

    :param page_url: URL to fetch
    :param count_try: current attempt number (starts at 1)
    :return: tuple (BeautifulSoup or None, bool next_parsing);
        ``next_parsing`` is False only on a hard 403 block.
    """
    session = get_session()
    r = session.get(page_url)
    next_parsing = True
    if r.status_code == 403:
        # Hard block: stop the whole parsing run.
        error_message = 'Error: ' + str(
            r.status_code) + ' \nTime to sleep. Exit.'
        text_handler(EXEPTION_CHAT, error_message)
        log.error(error_message)
        soup = None
        next_parsing = False
    elif r.status_code == 429 and count_try < 2:
        error_message = 'Error: ' + str(
            r.status_code
        ) + ' \nToo many request. Sleep 10min. \nTry № ' + str(
            count_try) + '\n' + str(page_url)
        text_handler(EXEPTION_CHAT, error_message)
        log.error(error_message)
        time.sleep(600)
        # BUGFIX: the recursive call returns a (soup, next_parsing) tuple;
        # the old code assigned it to `soup`, so callers received a nested
        # tuple and crashed on soup[0].find_all(...).  Propagate the whole
        # result (including the inner next_parsing) instead.
        return get_soup_from_page(page_url, count_try + 1)
    elif r.status_code == 429 and count_try < 4:
        error_message = 'Error: ' + str(
            r.status_code
        ) + ' \nToo many request. Sleep 15min. \nTry № ' + str(
            count_try) + '\n' + str(page_url)
        text_handler(EXEPTION_CHAT, error_message)
        log.error(error_message)
        time.sleep(900)
        return get_soup_from_page(page_url, count_try + 1)
    elif r.status_code != 200 and count_try < 4:
        error_message = 'Error: ' + str(
            r.status_code) + ' Try № ' + str(count_try) + '\n' + str(page_url)
        text_handler(EXEPTION_CHAT, error_message)
        log.error(error_message)
        time_sleep(get_random_time())
        return get_soup_from_page(page_url, count_try + 1)
    elif r.status_code != 200:
        # BUGFIX: was `count_try > 4`, which let a failing response at
        # exactly count_try == 4 fall through to the else branch and be
        # parsed as a normal page.  Any non-200 left at this point means
        # the retry budget is spent — give up.
        error_message = 'Error: ' + str(r.status_code) + ' Try ended'
        text_handler(EXEPTION_CHAT, error_message)
        log.warn(error_message)
        soup = None
    else:
        soup = BeautifulSoup(r.text, 'html.parser')
    return soup, next_parsing
def get_count_page(soup, url_task):
    """Extract the number of pagination pages from a search-results page.

    :param soup: BeautifulSoup of the search page
    :param url_task: task URL (only used in the error report)
    :return: page count as the text of the last pagination span
        (a str), or the int 1 when no pagination block is found;
        callers normalise with ``int(count_page)``.
    """
    try:
        pagination = soup.find('div', {"data-marker": "pagination-button"})
        # Drop the prev/next arrow buttons so the last remaining <span>
        # is the highest page number.
        pagination.find('span', {"data-marker": "pagination-button/prev"}).decompose()
        pagination.find('span', {"data-marker": "pagination-button/next"}).decompose()
        count_page = pagination.find_all('span')[-1].text
    except (AttributeError, IndexError):
        # Was a bare except.  AttributeError: find() returned None
        # (no pagination block / no arrow button); IndexError: no spans.
        count_page = 1
        error_message = 'Error pagination' + '\n ' + url_task
        text_handler(EXEPTION_CHAT, error_message)
        log.error(error_message)
    return count_page
def get_page_rows(soup, type_of):
    """Extract the offers table from a page and parse its rows.

    :param soup: BeautifulSoup of the search page
    :param type_of: category name attached to every parsed item
    :return: list of parsed items from ``get_item_data``, or [] when the
        catalog table is missing.
    """
    table = soup.find('div', {"data-marker": "catalog-serp"})
    if table:
        # Remove the advertising block, if present (lookup done once
        # instead of the previous duplicated find()).
        ad_block = table.find('div', {"data-marker": "witcher/block"})
        if ad_block:
            ad_block.decompose()
        rows = table.find_all('div', {"data-marker": "item"})
        result = get_item_data(rows, type_of)
    else:
        error_message = 'Error not table' + str(soup) + str(table)
        log.error(error_message)
        text_handler(EXEPTION_CHAT, 'Error not table// Check LOGS')
        result = []
    return result
def get_page_data(page_url, count_try):
    """Fetch one search page and parse its offers.

    :param page_url: URL of the page (already carries the ``&p=N`` part)
    :param count_try: attempt counter forwarded to ``get_soup_from_page``
    :return: tuple (list of parsed items, bool continue_pagination)
    """
    next_pagination = True
    # Unpack the (soup, next_parsing) tuple instead of indexing soup[0]/[1].
    soup, next_parsing = get_soup_from_page(page_url, count_try)
    result = []
    if not next_parsing:
        error_message = 'Next parsing none ' + str(page_url)
        log.error(error_message)
        text_handler(EXEPTION_CHAT, error_message)
        return result, False
    if not soup:
        error_message = 'Soup is None ' + str(page_url)
        log.error(error_message)
        text_handler(EXEPTION_CHAT, error_message)
        return result, False
    try:
        type_of = soup.find('div', {"data-marker": "breadcrumbs"}).find_all(
            'span', {"itemprop": "itemListElement"})[1].find('a').text
    except (AttributeError, IndexError):
        # Was a bare except.  AttributeError: a find() step returned None;
        # IndexError: fewer than two breadcrumb elements.
        type_of = 'None Type'
        log.warn('type_of = None Type')
    # One or more "items-items" containers; more than one means the page
    # switched to "other offers" suggestions — stop paginating.
    # (The old code ran this find_all twice.)
    item_containers = soup.find_all('div', attrs={"class": re.compile(r"items-items")})
    if len(item_containers) > 1:
        log.warn('Found another offers | Break pagination ' + str(page_url))
        next_pagination = False
    try:
        result = get_page_rows(soup, type_of)
    except Exception:  # was a bare except; keep best-effort behaviour
        error_message = 'Error get_page_rows' + '\n ' + page_url
        text_handler(EXEPTION_CHAT, error_message)
        log.error(error_message)
    return result, next_pagination
def write_sqlite3(url):
    """Persist parsed offers of one task into SQLite and queue notifications.

    For each parsed item: new offers are inserted, changed prices are
    updated (with JSON price history), unchanged offers only get their
    status refreshed.  Collected 'new'/'update' events are passed to
    ``parse_items_to_send`` at the end.

    :param url: tuple ``(items, (city, chat_id, urls_id))`` where ``items``
        is a list of parsed offer dicts (entries may be None on parse errors)
    :return: None
    """
    items = []
    sql_city = url[1][0]
    sql_chat = url[1][1]
    sql_urls_id = url[1][2]
    conn = sqlite3.connect(ROUTE_DB)
    with conn:
        cur = conn.cursor()
        # Reset status for every offer of this task; the ones seen in this
        # run get status=1 back below, the rest stay "gone".
        cur.execute('UPDATE offers SET status=0 WHERE urls_id=?',
                    (sql_urls_id, ))
        for i in range(0, len(url[0])):
            if url[0][i] is not None:
                sql_avito_id = url[0][i]['avito_id']
                sql_name = url[0][i]['name']
                sql_price = url[0][i]['price']
                sql_address = url[0][i]['address']
                sql_url = url[0][i]['url']
                sql_type_of = url[0][i]['type_of']
                sql_params = url[0][i]['params']
                price_history = []
                price_now = {
                    "data": str(get_date_time()),
                    "price": str(sql_price)
                }
                cur.execute('SELECT avito_id FROM offers WHERE avito_id=?',
                            (sql_avito_id, ))
                item_id = cur.fetchall()
                # Offer already known -> possibly a price update;
                # otherwise it is a brand-new offer.
                if item_id == [(sql_avito_id, )]:
                    cur.execute('SELECT price FROM offers WHERE avito_id=?',
                                (sql_avito_id, ))
                    item_price = cur.fetchall()
                    old_price = item_price[0][0]
                    cur.execute(
                        'SELECT price_history FROM offers WHERE avito_id=?',
                        (sql_avito_id, ))
                    price_history = json.loads(cur.fetchall()[0][0])
                    price_history.append(price_now)
                    price_history_dumps = json.dumps(price_history)
                    # Render at most the last 9 history entries as text.
                    price_history_srt = ''
                    start_count = 0
                    if len(price_history) > 0:
                        if len(price_history) > 9:
                            start_count = len(price_history) - 9
                        # BUGFIX: loop variable renamed (it shadowed the
                        # outer offer loop's `i`), and the date is rendered
                        # with str() — the old `'Дата: ' + int(...)` raised
                        # on every price change, since 'data' holds a
                        # date-time string, not a number.
                        for h in range(start_count, len(price_history)):
                            if h == 0:
                                price_history_srt = price_history_srt + \
                                    'Дата: ' + str(price_history[h]['data']) + ' ' + \
                                    'Цена: ' + num_conversion(int(price_history[h]['price'])) + ' руб.\n'
                            else:
                                percent_price_history = calculation_percent(
                                    int(price_history[h - 1]['price']),
                                    int(price_history[h]['price']))
                                price_history_srt = price_history_srt + \
                                    'Дата: ' + str(price_history[h]['data']) + ' ' + \
                                    'Цена: ' + num_conversion(int(price_history[h]['price'])) + ' руб. ' + \
                                    '(' + percent_price_history + '%)\n'
                    difference_price = calculation_different_price(
                        int(price_history[0]['price']),
                        int(price_now['price']))
                    percent_difference_price = calculation_percent(
                        int(price_history[0]['price']),
                        int(price_now['price']))
                    if item_price == [(sql_price, )]:
                        # Price unchanged: just refresh status/meta.
                        cur.execute(
                            "UPDATE offers SET status=1, updated_date=?,urls_id=?, type_of=?, params=? WHERE avito_id=?",
                            (str(get_date_time()), sql_urls_id, sql_type_of,
                             sql_params, sql_avito_id))
                        continue
                    else:
                        # Price changed: queue an 'update' notification and
                        # store the new price + history.
                        items.append({
                            'item_price': item_price,
                            'sql_chat': sql_chat,
                            'sql_avito_id': sql_avito_id,
                            'sql_name': sql_name,
                            'old_price': old_price,
                            'sql_price': sql_price,
                            'price_history_srt': price_history_srt,
                            'difference_price': difference_price,
                            'percent_difference_price': percent_difference_price,
                            'sql_address': sql_address,
                            'sql_url': sql_url,
                            'sql_params': sql_params,
                            'sql_type_of': sql_type_of,
                            'type_update': 'update'
                        })
                        cur.execute(
                            "UPDATE offers SET price=?, old_price=?, updated_date=?, price_history=?, status=1, urls_id=?, type_of=?, params=? WHERE avito_id=?",
                            (sql_price, old_price, str(get_date_time()),
                             str(price_history_dumps), sql_urls_id,
                             sql_type_of, sql_params, sql_avito_id))
                        log.info('Price update | ' + str(sql_avito_id))
                else:
                    # Unknown avito_id: queue a 'new' notification and insert.
                    items.append({
                        'item_price': None,
                        'sql_chat': sql_chat,
                        'sql_avito_id': sql_avito_id,
                        'sql_name': sql_name,
                        'old_price': None,
                        'sql_price': sql_price,
                        'price_history_srt': None,
                        'difference_price': None,
                        'percent_difference_price': None,
                        'sql_address': sql_address,
                        'sql_url': sql_url,
                        'sql_params': sql_params,
                        'sql_type_of': sql_type_of,
                        'type_update': 'new'
                    })
                    log.info('No ID -> New Offer | ' + str(sql_avito_id))
                    price_history.append(price_now)
                    price_history_dumps = json.dumps(price_history)
                    cur.execute(
                        "INSERT OR IGNORE INTO offers ('avito_id','name','price','price_history','address','url','created_date','updated_date','status','city','urls_id','type_of','params') VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?)",
                        (sql_avito_id, sql_name, sql_price,
                         str(price_history_dumps), sql_address, sql_url,
                         str(get_date_time()), str(get_date_time()), 1,
                         sql_city, sql_urls_id, sql_type_of, sql_params))
            else:
                error_message = 'Error: write Sql_item, item is None ' + str(
                    sql_urls_id)
                text_handler(EXEPTION_CHAT, error_message)
                log.error(error_message)
    parse_items_to_send(items)
    conn.commit()
    conn.close()