Exemple #1
0
def many_sr(part_of_codes, lock):
    """Collect profile links for every country code using one Safari driver.

    For each code the search result pages are walked one after another until
    a page yields exactly the same links as the page before it, which is
    taken as "no more results". The de-duplicated ``(href, code)`` pairs of
    that code are then written to ``PATH_TO_USERS_LINKS``.

    Args:
        part_of_codes: iterable of country codes substituted into the search
            URL.
        lock: lock shared with the other workers; it is held while the driver
            starts, while logging and while writing the CSV file.

    The driver is shut down exactly once, after the last code or on error.
    Previously it was closed as soon as the first code finished, so every
    following ``driver.get`` ran against a closed browser.
    """
    first_page = 'http://beboo.ru/search?iaS=0&status=all&country={}&region=all&town=all&lookFor=0'
    next_page = 'http://beboo.ru/search?iaS=0&status=all&country={}&region=all&town=all&lookFor=0&page={}'

    # Driver start-up is serialized through the shared lock.
    with lock:
        driver = webdriver.Safari()

    try:
        for code in part_of_codes:
            driver.get(first_page.format(code))

            users_links_by_country = []
            page = 1
            users_last_page = []
            while True:
                with lock:
                    # NOTE(review): ``page * 15`` is only an estimate and
                    # assumes 15 profiles per result page - confirm.
                    information(
                        'Work with {0} region, {1} page, already scrapped {2} profiles by {0}.'
                        .format(code, page, page * 15))

                users_this_page = [
                    (selenium_user.get_attribute('href'), code)
                    for selenium_user in
                    driver.find_elements_by_class_name('user-link')
                ]

                # A page identical to the previous one ends the pagination.
                if users_this_page == users_last_page:
                    break

                page += 1
                users_links_by_country.extend(users_this_page)
                driver.get(next_page.format(code, page))
                users_last_page = users_this_page

            with lock:
                my_csv.csv_data_writer(PATH_TO_USERS_LINKS,
                                       list(set(users_links_by_country)))
                # The logged number is the raw count, duplicates included.
                information(
                    'Work with {} region finished. {} links scrapped from country'
                    .format(code, len(users_links_by_country)))
    finally:
        # Release the browser on success and on failure alike.
        driver.quit()
Exemple #2
0
def get_links(first_page, country):
    """Collect all profile links of one country with a dedicated Chrome driver.

    Starting at ``first_page`` the search result pages are walked until a
    page yields exactly the same links as the page before it. The
    de-duplicated ``(href, country)`` pairs are written to
    ``PATH_TO_USERS_LINKS`` and the raw number of collected links is added to
    the global ``ALL_LINKS`` counter.

    Args:
        first_page: URL of the first search result page.
        country: country code; stored with every link and substituted into
            the URL of the following pages.

    The driver is shut down in a ``finally`` block so that a failure while
    paging does not leave a browser behind.
    """
    global ALL_LINKS
    next_page = 'http://beboo.ru/search?iaS=0&status=all&country={}&region=all&town=all&lookFor=0&page={}'

    users_links_by_country = []
    driver = webdriver.Chrome(PATH_TO_DRIVER)
    try:
        driver.get(first_page)

        page = 1
        users_last_page = []
        while True:
            # NOTE(review): ``page * 15`` is only an estimate and assumes
            # 15 profiles per result page - confirm.
            information(
                'Work with {0} region, {1} page, already scrapped {2} profiles by {0}, from other regions {3}.'
                .format(country, page, page * 15, ALL_LINKS))

            users_this_page = [
                (selenium_user.get_attribute('href'), country)
                for selenium_user in
                driver.find_elements_by_class_name('user-link')
            ]

            # A page identical to the previous one ends the pagination.
            if users_this_page == users_last_page:
                break

            page += 1
            users_links_by_country.extend(users_this_page)
            driver.get(next_page.format(country, page))
            users_last_page = users_this_page
    finally:
        driver.quit()

    my_csv.csv_data_writer(PATH_TO_USERS_LINKS,
                           list(set(users_links_by_country)))
    # The logged and accumulated number is the raw count, duplicates included.
    information(
        'Work with {} region finished. {} links scrapped from country'.
        format(country, len(users_links_by_country)))
    ALL_LINKS += len(users_links_by_country)
def _profile_row(dr, id_user, link_to_user, region_code):
    """Build the CSV row for the profile page currently loaded in ``dr``.

    The fields are read with ``find_by`` in the same order in which they
    appear in the row. A ``NoSuchElementException`` raised while reading a
    field is not handled here; the caller decides how to retry.
    """
    return [
        id_user,
        link_to_user,
        # name: only the first line of the nick-name element is kept
        find_by('bad_username', 'profile-nick-name', dr).partition('\n')[0],
        # NOTE(review): the "sex" column is read from the element with id
        # 'val_age' - confirm this is intended.
        find_by('id', 'val_age', dr),  # sex
        find_by('tag_1', 'dd', dr),  # type of account
        find_by('tag_2', 'dd', dr),  # age
        # country: taken from the caller instead of the page.
        # TODO: look it up in a dictionary by the text of the 'look_for'
        # element and use the matching country from the list.
        region_code,
        find_by('tag_4', 'dd', dr),  # city
        find_by('tag_6', 'p', dr).replace('\n', '; '),  # about
        find_by('tag_7', 'p', dr).replace('\n', '; '),  # find

        # about
        find_by('id', 'val_25', dr),  # marital status
        find_by('id', 'val_26', dr),  # income
        find_by('id', 'val_27', dr),  # financial situation
        find_by('id', 'val_28', dr),  # accommodation
        find_by('id', 'val_29', dr),  # car ownership
        find_by('id', 'val_30', dr),  # attitude to smoking
        find_by('id', 'val_31', dr),  # attitude to alcohol
        find_by('id', 'val_32', dr),  # languages

        # look
        find_by('id', 'val_height', dr).replace(' см', ''),  # height, cm
        find_by('id', 'val_weight', dr).replace(' кг', ''),  # weight, kg
        find_by('id', 'val_23', dr),  # hair color
        find_by('id', 'val_24', dr),  # eye color
        find_by('id', 'val_19', dr),  # body type
        find_by('id', 'val_20', dr),  # tattoos
        find_by('id', 'val_21', dr),  # piercing
        find_by('id', 'val_22', dr),  # facial and body hair

        # sexual preferences
        find_by('id', 'val_33', dr),  # orientation
        find_by('id', 'val_35', dr),  # type of sex
        find_by('id', 'val_34', dr),  # role
        find_by('id', 'val_36', dr),  # poses
        find_by('id', 'val_37', dr),  # actions
        find_by('id', 'val_38', dr),  # erogenous zones
        find_by('id', 'val_39', dr),  # fetishes
    ]


def scr_user(dr, number_user, link_to_user, region_code):
    """Scrape one user profile and store its data and photo links.

    The profile page is opened in ``dr``. If the page contains an element of
    class ``info-404`` the profile is counted in ``BLACK_USERS`` and nothing
    else happens. Otherwise the profile fields are appended to
    ``PATH_TO_USERS_INFORMATION``, the photo links (each prefixed with the
    user id) are saved, and a progress message is logged.

    Args:
        dr: Selenium driver used to load the page.
        number_user: zero-based position of the user, used for progress
            output only.
        link_to_user: profile URL; the user id is derived from it.
        region_code: value stored in the country column.

    If a field is missing from the page the function sleeps for 300 seconds
    and retries the whole profile. The retry does all the work, so this call
    returns right after it. It used to fall through and fail on field
    variables that were never assigned, and it counted the user in
    ``WHITE_USERS`` once per attempt.
    """
    global WHITE_USERS, BLACK_USERS
    dr.get(link_to_user)

    try:
        dr.find_element_by_class_name('info-404')
    except selenium_exception.NoSuchElementException:
        pass  # no 404 marker: the profile exists, continue below
    else:
        BLACK_USERS += 1
        return

    id_user = link_to_user.replace('http://beboo.ru/profile/',
                                   '').replace('?from=1', '')

    try:
        row = _profile_row(dr, id_user, link_to_user, region_code)
    except selenium_exception.NoSuchElementException:
        # Wait, then start over for this user.
        time.sleep(300)
        scr_user(dr, number_user, link_to_user, region_code)
        return

    # Count the user only after all fields were read, so that a retry does
    # not count the same user twice.
    WHITE_USERS += 1

    my_csv.csv_line_writer(PATH_TO_USERS_INFORMATION, row)

    users_photos_links = get_links_to_photo(dr)

    # Put the user id in front of every photo entry.
    for photo_links in users_photos_links:
        photo_links.insert(0, id_user)
    # Save the photo links.
    for photo_links in users_photos_links:
        save_photo_links(photo_links)

    # NOTE(review): '.scv' looks like a typo for '.csv'; it is kept because
    # other code may rely on the existing file names.
    my_csv.csv_data_writer(PATH_TO_TEMP_DIR + '/{}.scv'.format(id_user),
                           users_photos_links)

    information(
        'User {} (user id is {}) is scrapped with photo(-s). Now {}/{} users done ({}%), {} is dead.'
        .format(
            number_user + 1, id_user, WHITE_USERS, GRAY_USERS,
            round(((number_user + 1 + BLACK_USERS) / GRAY_USERS) * 100, 2),
            BLACK_USERS))