Example #1
0

def get_obj_links_from_page(page, selector=''):
    """Parse an HTML page and return the elements matching a CSS selector.

    Args:
        page: HTML markup of the page, in whatever form
            ``html.document_fromstring`` accepts (``html`` is presumably
            ``lxml.html`` -- confirm against the file's imports).
        selector: CSS selector identifying the link elements. The default
            is the empty placeholder the code has always carried.
            NOTE(review): an empty selector is expected to be rejected by
            ``cssselect``; pass a real selector until a default is chosen.

    Returns:
        The result of ``cssselect(selector)`` on the parsed document.
    """
    tree = html.document_fromstring(page)
    # The method is spelled ``cssselect``; the former ``cssselet`` raised
    # AttributeError on every call.
    links = tree.cssselect(selector)

    # Profile item listings (active / closed), kept for reference:
    #https://www.avito.ru/user/e1b47b49f2ed83208fd34f902ad9cfb5/profile/items?shortcut=active&offset=0&limit=16
    #https://www.avito.ru/user/e1b47b49f2ed83208fd34f902ad9cfb5/profile/items?shortcut=closed&offset=0&limit=16
    return links


if __name__ == '__main__':
    # Read the stored user id; only rows with a larger `user_id` are
    # selected by the query below.
    state_file = os.path.join(cs.path_data, 'cur_user_id.txt')
    curIndex = pt.CurIndex(state_file)
    cur_user_id = curIndex.get()

    c, cu = cs.get_db()

    sql = "SELECT * FROM `user` WHERE `user_id` > ?"
    cursor = cu.execute(sql, (cur_user_id, ))
    res = cursor.fetchall()

    for user in res:
        profile_path = user['user_url']

        # Nothing to do for rows without a URL or with a URL that is not
        # a '/user/...' path.
        if not profile_path or not profile_path.startswith('/user/'):
            continue

        # Request the first 16 entries of the profile's "closed" listing.
        # NOTE(review): no timeout is passed to requests.get here.
        url = profile_path + '/items?shortcut=closed&offset=0&limit=16'
        r = requests.get(cs.domain_url + url)
        json = r.json()
# Register the `--only-empty` option; it stays None when not given.
# NOTE(review): `cli_parser` is created outside this view -- presumably an
# argparse.ArgumentParser. If so, with no `action` this option expects a
# value on the command line rather than acting as a bare flag -- confirm.
cli_parser.add_argument('--only-empty', default=None)

# Parsed at module level, so the command line is read as soon as this code runs.
cli_args = cli_parser.parse_args()

# TODO: collect info about the publisher (full name, name of the organization)


def func_compare_publ_date(old, new):
    """Report whether two publication values start with the same token.

    Each argument is split on whitespace and only its first piece is
    compared, so anything after the first whitespace run is ignored.

    Args:
        old: Previously stored value (a string).
        new: Freshly obtained value (a string).

    Returns:
        True when the leading tokens are equal, False otherwise.

    Raises:
        IndexError: If either argument contains no non-whitespace text.
    """
    # Only the first token matters, so one split is enough.
    old_head = old.split(None, 1)[0]
    new_head = new.split(None, 1)[0]
    return old_head == new_head


if __name__ == '__main__':
    curIndex = pt.CurIndex(os.path.join(cs.path_data, 'cur_obj_id.txt'))
    cur_obj_id = curIndex.get()

    c, cu = cs.get_db()
    c2, cu2 = cs.get_db()
    proxies = pt.Proxies(c, cu, 'avito', cs.path_data)
    history = pt.DBHistory(c2, cu2)

    #sql = "SELECT `realty_price`, `realty_id`, `realty_url`, `realty_ext_id` FROM `realty` WHERE `realty_id`>=? AND `realty_is_redirect` IS NULL AND `site`=? "
    sql = "SELECT `realty_price`, `realty_id`, `realty_url`, `realty_ext_id` FROM `realty` WHERE `realty_id`>=? AND `site`=? "
    if cli_args.only_empty: sql += " AND `realty_price` is NULL "
    res = cu.execute(sql, (cur_obj_id, cs.site_id)).fetchall()
    for obj in res:
        print(obj['realty_id'])
        print(obj['realty_url'], end='  ')
        # TODO: if status_code is 404, the object is unpublished. Add a row to the `realty_status` table;
        # we also need to go to the publisher's pager and try to find the object in the 'finished' list of objects

        # -------- getting page