def get_obj_links_from_page(page):
    """Parse an HTML page and return the elements matched by a CSS selector.

    NOTE(review): the selector is still an empty placeholder, so this
    function is unfinished. Presumably it should select the item links of a
    user's profile listing -- fill in the selector before relying on it.

    Args:
        page: HTML source of the page, handed to ``html.document_fromstring``.

    Returns:
        The elements returned by ``cssselect`` for the selector.
    """
    document = html.document_fromstring(page)
    # Fixed typo: the method is ``cssselect`` (assuming ``html`` is
    # ``lxml.html``); the previous ``cssselet`` raised AttributeError on
    # every call.
    links = document.cssselect('')  # TODO: selector not filled in yet
    return links


# Example profile listing URLs (active and closed items):
# https://www.avito.ru/user/e1b47b49f2ed83208fd34f902ad9cfb5/profile/items?shortcut=active&offset=0&limit=16
# https://www.avito.ru/user/e1b47b49f2ed83208fd34f902ad9cfb5/profile/items?shortcut=closed&offset=0&limit=16

if __name__ == '__main__':
    # Read the stored user id from cur_user_id.txt; only users with a larger
    # id are selected below (presumably a resume checkpoint -- TODO confirm).
    curIndex = pt.CurIndex(os.path.join(cs.path_data, 'cur_user_id.txt'))
    cur_user_id = curIndex.get()

    c, cu = cs.get_db()
    sql = "SELECT * FROM `user` WHERE `user_id` > ?"
    res = cu.execute(sql, (cur_user_id, )).fetchall()

    for user in res:
        # Rows without a profile URL cannot be fetched.
        if not user['user_url']:
            continue
        # Only site-relative '/user/...' profile URLs are requested.
        if user['user_url'].startswith('/user/'):
            # Possible alternative: strip any query string first.
            #url = user['user_url'].split('?')[0]
            url = user['user_url'] + '/items?shortcut=closed&offset=0&limit=16'
            r = requests.get(cs.domain_url + url)
            # NOTE(review): this name shadows a module called ``json`` if one
            # is imported in this file; kept as-is so later code still works.
            json = r.json()
cli_parser.add_argument('--only-empty', default=None)
cli_args = cli_parser.parse_args()


#TODO collect info about publicator (fio, name of organization)


def func_compare_publ_date(old, new):
    """Return True when two publication-date values share the same first token.

    Only the first whitespace-separated token of each value is compared
    (presumably the date part of a "date time" string -- TODO confirm the
    stored format).

    Missing values (``None``, empty or whitespace-only strings) no longer
    raise: two missing values compare equal, and a missing value never
    equals a present one.

    Args:
        old: Previously stored publication-date string, or a missing value.
        new: Newly scraped publication-date string, or a missing value.

    Returns:
        bool: whether the leading tokens of ``old`` and ``new`` are equal.
    """
    # Slicing with [:1] instead of indexing with [0] yields an empty list for
    # blank input rather than raising IndexError.
    old_head = (old or '').split()[:1]
    new_head = (new or '').split()[:1]
    return old_head == new_head


if __name__ == '__main__':
    # Read the stored object id from cur_obj_id.txt; only rows with
    # realty_id >= that value are selected below (presumably a resume
    # checkpoint -- TODO confirm).
    curIndex = pt.CurIndex(os.path.join(cs.path_data, 'cur_obj_id.txt'))
    cur_obj_id = curIndex.get()

    # Two separate connection/cursor pairs: the first is used for the main
    # query and the proxies helper, the second is handed to the history helper.
    c, cu = cs.get_db()
    c2, cu2 = cs.get_db()
    proxies = pt.Proxies(c, cu, 'avito', cs.path_data)
    history = pt.DBHistory(c2, cu2)

    # Earlier variant of the query that also skipped redirected objects:
    #sql = "SELECT `realty_price`, `realty_id`, `realty_url`, `realty_ext_id` FROM `realty` WHERE `realty_id`>=? AND `realty_is_redirect` IS NULL AND `site`=? "
    sql = "SELECT `realty_price`, `realty_id`, `realty_url`, `realty_ext_id` FROM `realty` WHERE `realty_id`>=? AND `site`=? "
    # With --only-empty, restrict the run to objects that have no price yet.
    if cli_args.only_empty:
        sql += " AND `realty_price` is NULL "
    res = cu.execute(sql, (cur_obj_id, cs.site_id)).fetchall()

    for obj in res:
        print(obj['realty_id'])
        # end=' ' keeps the cursor on the same line as the URL.
        print(obj['realty_url'], end=' ')
        #TODO if status_code is 404, the object is unpublished. Add a row into the `realty_status` table
        # and we need to go into the publicator's pager and try to find the object in the 'finished' list of objects

        # -------- getting page