def _many_sr_collect_region(driver, code, lock):
    """Page through the search results for one country code.

    Loads the first search page for ``code`` in ``driver`` and then keeps
    requesting ``page=2``, ``page=3``, ... until a page yields exactly the
    same list of links as the page before it (this also covers a first page
    with no links at all).

    :param driver: an already started Selenium driver.
    :param code: country code substituted into the search URL.
    :param lock: lock held while calling ``information``.
    :return: list of ``(href, code)`` tuples in the order they were seen;
        duplicates are NOT removed here.
    """
    # '&region' was stored as the mis-decoded character sequence '®ion',
    # which merged the region parameter into the country value.
    first_page = ('http://beboo.ru/search?iaS=0&status=all&country={}'
                  '&region=all&town=all&lookFor=0')
    next_page = first_page + '&page={}'

    driver.get(first_page.format(code))
    collected = []
    previous_page = []
    page = 1
    while True:
        with lock:
            information(
                'Work with {0} region, {1} page, already scrapped {2} profiles by {0}.'
                .format(code, page, len(collected)))
        current_page = [
            (element.get_attribute('href'), code)
            for element in driver.find_elements_by_class_name('user-link')
        ]
        # A repeated page is the stop signal (presumably the site serves the
        # last page again once the page number runs past the end).
        if current_page == previous_page:
            return collected
        collected.extend(current_page)
        page += 1
        driver.get(next_page.format(code, page))
        previous_page = current_page


def many_sr(part_of_codes, lock):
    """Collect user profile links for several country codes with one driver.

    A single Safari driver is started and reused for every code in
    ``part_of_codes``. For each code the links are gathered, de-duplicated
    (keeping first-seen order) and written to ``PATH_TO_USERS_LINKS``.

    The driver is shut down once, after the last code or when an exception
    escapes; closing it after the first code would leave the remaining codes
    without a usable window.

    :param part_of_codes: iterable of country codes to process.
    :param lock: lock object supporting the ``with`` statement; it guards
        driver start-up, logging and the CSV write.
        NOTE(review): presumably shared between parallel workers - confirm
        against the caller.
    :return: None
    """
    with lock:
        driver = webdriver.Safari()
    try:
        for code in part_of_codes:
            links = _many_sr_collect_region(driver, code, lock)
            with lock:
                # dict.fromkeys drops duplicates but keeps a stable order.
                my_csv.csv_data_writer(PATH_TO_USERS_LINKS,
                                       list(dict.fromkeys(links)))
                information(
                    'Work with {} region finished. {} links scrapped from country'
                    .format(code, len(links)))
    finally:
        driver.quit()
def get_links(first_page, country):
    """Collect user profile links for one country and append them to the CSV.

    Starts a Chrome driver, loads ``first_page`` and then requests the
    following result pages (``page=2``, ``page=3``, ...) until a page yields
    exactly the same list of links as the page before it. The links are
    de-duplicated (keeping first-seen order) and written to
    ``PATH_TO_USERS_LINKS``. The global counter ``ALL_LINKS`` is increased
    by the number of links collected before de-duplication.

    The driver is always shut down, including when scraping raises.

    :param first_page: URL of the first search-results page.
    :param country: country code; used in the follow-up page URLs, in log
        messages and stored next to every link.
    :return: None
    """
    global ALL_LINKS

    # '&region' was stored as the mis-decoded character sequence '®ion',
    # which merged the region parameter into the country value.
    next_page = ('http://beboo.ru/search?iaS=0&status=all&country={}'
                 '&region=all&town=all&lookFor=0&page={}')

    collected = []
    driver = webdriver.Chrome(PATH_TO_DRIVER)
    try:
        driver.get(first_page)
        previous_page = []
        page = 1
        while True:
            information(
                'Work with {0} region, {1} page, already scrapped {2} profiles by {0}, from other regions {3}.'
                .format(country, page, len(collected), ALL_LINKS))
            current_page = [
                (element.get_attribute('href'), country)
                for element in driver.find_elements_by_class_name('user-link')
            ]
            # A repeated page is the stop signal (presumably the site serves
            # the last page again once the page number runs past the end).
            if current_page == previous_page:
                break
            collected.extend(current_page)
            page += 1
            driver.get(next_page.format(country, page))
            previous_page = current_page
    finally:
        driver.quit()

    # dict.fromkeys drops duplicates but keeps a stable order.
    my_csv.csv_data_writer(PATH_TO_USERS_LINKS, list(dict.fromkeys(collected)))
    information(
        'Work with {} region finished. {} links scrapped from country'
        .format(country, len(collected)))
    ALL_LINKS += len(collected)
def _scr_user_read_fields(dr, region_code):
    """Read the profile fields from the page currently loaded in ``dr``.

    The values are returned as a list in the column order used for the users
    information CSV (everything after the id and link columns); ``find_by``
    is called in that same order. Exceptions raised by ``find_by`` propagate
    to the caller.

    :param dr: Selenium driver positioned on a profile page.
    :param region_code: value stored in the country column.
    :return: list of field values.
    """
    # TODO: derive the country from a dictionary instead of region_code:
    # look up the text of the 'look_for' element and, if it matches a
    # country from the list, substitute that country.
    return [
        # --- general ---
        find_by('bad_username', 'profile-nick-name', dr).partition('\n')[0],  # name (text before the first newline)
        find_by('id', 'val_age', dr),  # sex  NOTE(review): read from the 'val_age' element - confirm
        find_by('tag_1', 'dd', dr),  # type of account
        find_by('tag_2', 'dd', dr),  # age
        region_code,  # country
        find_by('tag_4', 'dd', dr),  # city
        find_by('tag_6', 'p', dr).replace('\n', '; '),  # about
        find_by('tag_7', 'p', dr).replace('\n', '; '),  # find
        # --- about ---
        # NOTE: clicking the 'advTab' element before reading is disabled.
        find_by('id', 'val_25', dr),  # marital status
        find_by('id', 'val_26', dr),  # income
        find_by('id', 'val_27', dr),  # financial situation
        find_by('id', 'val_28', dr),  # accommodation
        find_by('id', 'val_29', dr),  # car ownership
        find_by('id', 'val_30', dr),  # attitude to smoking
        find_by('id', 'val_31', dr),  # attitude to alcohol
        find_by('id', 'val_32', dr),  # languages known
        # --- look ---
        find_by('id', 'val_height', dr).replace(' см', ''),  # height, cm (unit suffix stripped)
        find_by('id', 'val_weight', dr).replace(' кг', ''),  # weight, kg (unit suffix stripped)
        find_by('id', 'val_23', dr),  # hair colour
        find_by('id', 'val_24', dr),  # eye colour
        find_by('id', 'val_19', dr),  # build
        find_by('id', 'val_20', dr),  # tattoos
        find_by('id', 'val_21', dr),  # piercing (flagged '!!!' by the author)
        find_by('id', 'val_22', dr),  # facial and body hair
        # --- sexual preferences ---
        find_by('id', 'val_33', dr),  # orientation
        find_by('id', 'val_35', dr),  # type of sex
        find_by('id', 'val_34', dr),  # role
        find_by('id', 'val_36', dr),  # poses
        find_by('id', 'val_37', dr),  # actions (flagged '!!!' by the author)
        find_by('id', 'val_38', dr),  # erogenous zones
        find_by('id', 'val_39', dr),  # fetishes
    ]


def scr_user(dr, number_user, link_to_user, region_code):
    """Scrape one user profile and store its data and photo links.

    Loads ``link_to_user`` in ``dr``. If the page contains an element with
    class ``info-404`` the profile is counted in ``BLACK_USERS`` and nothing
    else happens. Otherwise the profile fields are read and written as one
    row to ``PATH_TO_USERS_INFORMATION``, the photo links are collected,
    passed to ``save_photo_links`` and written to a per-user file in
    ``PATH_TO_TEMP_DIR``, and ``WHITE_USERS`` is increased by one.

    If reading the fields raises ``NoSuchElementException`` the function
    sleeps for 300 seconds and starts over with a fresh page load. The
    retry is a loop, so a retried user is written and counted exactly once
    and no half-read data is ever written.

    :param dr: Selenium driver used to load the profile.
    :param number_user: zero-based index of this user, used only in the
        progress message.
    :param link_to_user: profile URL.
    :param region_code: value stored in the country column.
    :return: None
    """
    global GRAY_USERS, WHITE_USERS, BLACK_USERS, PATH_TO_USERS_INFORMATION, PATH_TO_TEMP_DIR

    id_user = (link_to_user
               .replace('http://beboo.ru/profile/', '')
               .replace('?from=1', ''))

    while True:
        dr.get(link_to_user)
        # An 'info-404' element marks a profile that is not available.
        try:
            dr.find_element_by_class_name('info-404')
        except selenium_exception.NoSuchElementException:
            pass
        else:
            BLACK_USERS += 1
            return

        try:
            profile_fields = _scr_user_read_fields(dr, region_code)
        except selenium_exception.NoSuchElementException:
            # Expected element missing: wait and retry the whole page.
            time.sleep(300)
            continue
        break

    WHITE_USERS += 1
    my_csv.csv_line_writer(PATH_TO_USERS_INFORMATION,
                           [id_user, link_to_user] + profile_fields)

    users_photos_links = get_links_to_photo(dr)
    for photo_row in users_photos_links:
        photo_row.insert(0, id_user)  # add user id to photo
        save_photo_links(photo_row)
    # NOTE(review): '.scv' looks like a typo for '.csv'; left unchanged
    # because other code may read these files by this name.
    my_csv.csv_data_writer(PATH_TO_TEMP_DIR + '/{}.scv'.format(id_user),
                           users_photos_links)

    information(
        'User {} (user id is {}) is scrapped with photo(-s). Now {}/{} users done ({}%), {} is dead.'
        .format(
            number_user + 1, id_user, WHITE_USERS, GRAY_USERS,
            round(((number_user + 1 + BLACK_USERS) / GRAY_USERS) * 100, 2),
            BLACK_USERS))