def scroll_to_end_by_class_name(driver, class_name, number_requested):
    """Scroll the active window until no more elements of a class get loaded.

    The last element found is scrolled into view, then the elements are
    queried again. The loop ends when two consecutive queries return the same
    number of elements, when a query returns nothing, or when a time out is
    reported while scrolling.

    Args:
        driver: web driver controlling the active window.
        class_name: class name of the elements to look for.
        number_requested: expected number of items; it is only used for
            creating a realistic progress bar.
    """
    eles = driver.find_elements_by_class_name(class_name)
    count = 0
    new_count = len(eles)

    # `eles` is tested as well so that an emptied list never reaches eles[-1]
    while new_count != count and eles:
        # a request of 0 items must not break the progress computation
        progress = new_count / number_requested if number_requested else 0
        try:
            utils.update_progress(
                progress,
                f' - Scrolling down to load more items {new_count}/{number_requested}:'
            )
            the_last_in_list = eles[-1]
            # reading this property asks the browser to scroll the element into view
            the_last_in_list.location_once_scrolled_into_view
            time.sleep(random.randint(15, 20) / 10)
            try:
                WebDriverWait(driver, timeout=60).until(
                    EC.visibility_of(the_last_in_list))
            except TimeoutException:
                # the element did not become visible in time: re-count anyway
                pass

            count = new_count
            eles = driver.find_elements_by_class_name(class_name)
            new_count = len(eles)
        except TimeoutException:
            printR(' Time out while scrolling down. Please retry.')
            # the counters are unchanged at this point, so looping again would
            # repeat the very same step without end
            break
        except NoSuchElementException:
            pass

    if new_count < number_requested:
        utils.update_progress(
            1,
            f' - Scrolling down to load more items:{new_count}/{number_requested}'
        )
def scroll_down_active_page(driver,
                            web_element=None,
                            class_name_to_check='',
                            tag_name_to_check='',
                            xpath_to_check='',
                            number_requested=100,
                            message='',
                            time_out=60):
    """Scroll down the active window until the requested number of items is loaded.

    - Items are located by class name, tag name or xpath. If more than one
      selector is given, the priority is class name, then tag name, then xpath.
      If none is given, no action is taken.
    - After each scroll the page height is polled once per second while it
      stays unchanged. The polling budget is 'time_out' ticks and is shared by
      all the scrolls of one call. When it is used up and the requested number
      of items is still not reached, the process stops.

    Args:
        driver: web driver used to run the scrolling scripts.
        web_element: element to search within; the driver itself when None.
        class_name_to_check: class name of the items to count.
        tag_name_to_check: tag name of the items to count.
        xpath_to_check: xpath of the items to count.
        number_requested: number of items wanted; -1 stands for
            config.MAX_NOTIFICATION_REQUEST.
        message: not used by this function.
        time_out: number of one-second polls allowed for a slow server.

    Returns:
        The list of items found after scrolling, or None when no selector was
        given, when no item was found, or when enough items were already loaded
        before any scroll.
    """
    if web_element is None:
        web_element = driver

    # choose the lookup once; it is reused after every scroll
    if class_name_to_check:
        find_items, selector = web_element.find_elements_by_class_name, class_name_to_check
    elif tag_name_to_check:
        find_items, selector = web_element.find_elements_by_tag_name, tag_name_to_check
    elif xpath_to_check:
        find_items, selector = web_element.find_elements_by_xpath, xpath_to_check
    else:
        printR(' Items were not specified. The process stopped.')
        return None

    items = find_items(selector)
    if not items:
        printR(' No items found.')
        return None

    # The -1 sentinel has to be resolved BEFORE the comparison below:
    # len(items) >= -1 is always true, which used to end the call right here.
    if number_requested == -1:
        number_requested = config.MAX_NOTIFICATION_REQUEST

    if len(items) >= number_requested:
        return None

    # get the current height of the page
    last_scroll_height = driver.execute_script(
        "return document.body.scrollHeight")
    time_out_count_down = time_out
    count_sofar = 0

    while count_sofar < number_requested:
        utils.update_progress(
            count_sofar / number_requested,
            f' - Scrolling down {count_sofar}/{number_requested}')

        # scroll down to bottom
        driver.execute_script(
            "window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(1)
        new_scroll_height = driver.execute_script(
            "return document.body.scrollHeight")

        # give the slow server a chance to load the new items
        while new_scroll_height == last_scroll_height and time_out_count_down >= 0:
            time_out_count_down -= 1
            new_scroll_height = driver.execute_script(
                "return document.body.scrollHeight")
            time.sleep(1)
        last_scroll_height = new_scroll_height

        items = find_items(selector)
        count_sofar = len(items)

        if count_sofar < number_requested and time_out_count_down <= 0:
            printR(
                f'\n Time out ({time_out}s)! {count_sofar}/{number_requested} items obtained. You may try again at another time'
            )
            break
    else:
        # normal termination of the while loop: show completed progress bar
        utils.update_progress(
            1, f' - Scrolling down {number_requested}/{number_requested}')

    return items
def scroll_to_end_by_tag_name_within_element(driver, element, tag_name, number_requested, time_out=20):
    """Scroll until the last element of a tag name within an element is loaded.

    The last element found is scrolled into view and the elements are queried
    again. When the count does not grow and is still below 'number_requested',
    the query is repeated once per second while the count down lasts. The
    count down starts at 'time_out' and is shared by the whole call.

    Args:
        driver: web driver controlling the active window.
        element: element within which the tags are searched.
        tag_name: tag name of the items to look for.
        number_requested: expected number of items; also used for creating a
            realistic progress bar.
        time_out: seconds to wait for the last element to become visible, and
            the start value of the slow-server count down.

    Returns:
        The elements obtained by the last query.
    """
    eles = check_and_get_all_elements_by_tag_name(element, tag_name)
    count = 0
    new_count = len(eles)
    count_down_timer = time_out

    # `eles` is tested as well so that an emptied list never reaches eles[-1]
    while new_count != count and eles:
        # a request of 0 items must not break the progress computation
        progress = new_count / number_requested if number_requested else 0
        try:
            utils.update_progress(
                progress,
                f' - Scrolling down to load more items {new_count}/{number_requested}:'
            )
            the_last_in_list = eles[-1]
            # reading this property asks the browser to scroll the element into view
            the_last_in_list.location_once_scrolled_into_view
            time.sleep(1)
            try:
                WebDriverWait(driver, time_out).until(
                    EC.visibility_of(the_last_in_list))
            except TimeoutException:
                # the element did not become visible in time: re-count anyway
                pass

            count = new_count
            eles = check_and_get_all_elements_by_tag_name(element, tag_name)
            new_count = len(eles)

            # give the slow server a chance to load the new items
            while new_count == count and count_down_timer >= 0 and new_count < number_requested:
                utils.update_progress(
                    count_down_timer / time_out if time_out else 0,
                    f' - Slow response from server. Counting down {count_down_timer}:'
                )
                count_down_timer -= 1
                eles = check_and_get_all_elements_by_tag_name(
                    element, tag_name)
                new_count = len(eles)
                if eles:
                    eles[-1].location_once_scrolled_into_view
                time.sleep(1)
        except TimeoutException:
            printR(
                f' Time out ({time_out}s) while scrolling down. Please retry.'
            )
            # looping again on unchanged counters could repeat this step forever
            break
        except NoSuchElementException:
            pass

    # never show more items than requested; a shortfall is shown as it is
    shown_count = min(new_count, number_requested)
    utils.update_progress(
        1,
        f' - Scrolling down to load more items:{shown_count} / {number_requested}'
    )
    return eles
def scroll_down(driver, scroll_pause_time=0.5, number_of_scrolls=10, estimate_scrolls_needed=3, message=''):
    """Scroll down the active window a controlled number of times.

    - scroll_pause_time is the pause, in seconds, after each scroll. It should
      be chosen according to the content of the page. Default is 0.5s.
    - number_of_scrolls = 0: return without scrolling.
    - number_of_scrolls = -1: keep scrolling until the page height stops
      changing. The progress bar is then based on estimate_scrolls_needed
      (total request items / load items per scroll). If that is -1 as well,
      a running count of loaded notifications is written instead.
    - Any other value limits the number of scrolls. Scrolling also ends early
      when the page height stops changing.
    - message is the title of the progress bar.
    """
    if number_of_scrolls == 0:
        return

    until_page_end = number_of_scrolls == -1
    indeterminate = until_page_end and estimate_scrolls_needed == -1

    previous_height = driver.execute_script("return document.body.scrollHeight")
    scrolls_done = 0       # compared with number_of_scrolls for a limited run
    progress_ticks = 0     # drives the simulated progress of an unlimited run

    while True:
        if indeterminate:
            # the total is unknown: show a running count instead of a ratio
            loaded_so_far = progress_ticks * config.NOTIFICATION_PER_LOAD
            sys.stdout.write(f'\r{message} {loaded_so_far!s}')
            sys.stdout.flush()
        elif until_page_end:
            # the number of scrolls needed could be estimated by the caller
            utils.update_progress(progress_ticks / estimate_scrolls_needed, message)
        elif scrolls_done > 0:
            utils.update_progress(scrolls_done / number_of_scrolls, message)
        progress_ticks += 1

        # scroll to the bottom, then wait for the page to load
        driver.execute_script(
            "window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(scroll_pause_time)
        # make sure document javascript is executed
        driver.execute_script("return document.body.innerHTML")

        # exit point #1: the number of scrolls requested has been reached
        if not until_page_end:
            scrolls_done += 1
            if scrolls_done >= number_of_scrolls:
                break

        # exit point #2: the height did not change after the pause, meaning all
        # items are loaded or the server stopped responding
        current_height = driver.execute_script("return document.body.scrollHeight")
        if current_height == previous_height:
            break
        previous_height = current_height

    # mark the end of the progress update
    if indeterminate:
        sys.stdout.write('\r\n')
        sys.stdout.flush()
    else:
        utils.update_progress(1, message)

    time.sleep(scroll_pause_time)
def CSV_list_to_HTML_table(csv_file_name,
                           csv_file_type,
                           output_lists,
                           use_local_thumbnails=True,
                           ignore_columns=None,
                           encoding='utf-16',
                           column_to_sort='No',
                           start_indent=1,
                           headline_tag='h4'):
    """Given a csv file containing a list of items, return a html string
    containing a description headline and a html table representing the items.

    - csv_file_type.name is the base text for the headline id and headline text
        headline id   = [csv_file_type.name]_headline
        headline text = TABLE_CAPTION[csv_file_type.name] (# of rows)
        sample of headline string: <h4 id="followers_headline">Followers (1234)</h4>
      The table id is always 'main_table'.
    - The columns listed in ignore_columns are not written to the table.
    - The file types handled explicitly in the Relationship column are
      reciprocal, not_follow, following, all_users, unique_users,
      all_unique_users, notifications and like_actors.

    Args:
        csv_file_name: path of the csv file; it must have the '.csv' extension.
        csv_file_type: object whose 'name' is a key of TABLE_WIDTHS and
            TABLE_CAPTION.
        output_lists: object providing 'avatars_dir' and 'thumbnails_dir'.
        use_local_thumbnails: use the '... Local' columns for the images
            instead of the '... Href' columns.
        ignore_columns: column headers to leave out of the table.
        encoding: encoding of the csv file.
        column_to_sort: not used by this function.
        start_indent: number of tabs at the first indentation level.
        headline_tag: html tag used for the headline.

    Returns:
        The html string (headline followed by the table), or '' when the file
        name is empty, is not a '.csv' file, or the file has fewer than 3
        columns (this includes an empty file).
    """
    if csv_file_name == '':
        return ''
    # file extension check
    if os.path.splitext(csv_file_name)[1] != ".csv":
        return ''
    if ignore_columns is None:
        ignore_columns = []

    # indentation strings: a line break followed by a growing number of tabs
    base_indent = '\n' + '\t' * start_indent
    tab1, tab2, tab3, tab4, tab5, _, tab7, tab8, tab9 = (
        base_indent + '\t' * depth for depth in range(9))

    table_id = 'main_table'
    avatars_folder = os.path.basename(
        os.path.normpath(output_lists.avatars_dir))
    thumbnails_folder = os.path.basename(
        os.path.normpath(output_lists.thumbnails_dir))
    file_type = csv_file_type.name
    main_table_width = TABLE_WIDTHS[file_type]

    with open(csv_file_name, newline='', encoding=encoding) as csvfile:
        reader = csv.DictReader(row.replace('\0', '') for row in csvfile)
        headers = reader.fieldnames
        # fieldnames is None when the file has no header row at all
        if not headers or len(headers) < 3:
            printR(f' File {csv_file_name} is in wrong format!')
            return ''
        rows = list(reader)

    visible_headers = [h for h in headers if h not in ignore_columns]

    # header row: one <th> per visible column, with its configured width
    header_cells = [
        f'{tab5}<th width="{COL_WIDTHS[header]}"><div class="hdr_text">{header}</div></th>'
        for header in visible_headers
    ]
    header_string = f'{tab3}<thead>{tab4}<tr>' + ''.join(header_cells) + '</tr></thead>'

    def render_cell(row, col_header):
        """Return the <td> markup of one cell of the given row."""
        text = row[col_header]

        # Display Name column: user's avatar and the display name, both linked
        if col_header == 'Display Name':
            user_home_page = f'https://500px.com/{row["User Name"]}'
            if use_local_thumbnails:
                user_avatar = f"{avatars_folder}/{row['Avatar Local']}"
            else:
                user_avatar = row['Avatar Href']
            return (
                f'{tab5}<td><div><div style="width: 30%; float:left;">{tab8}<a href="{user_home_page}" target="_blank">'
                f'{tab9}<img src={user_avatar}></a></div>'
                f'{tab7}<div><a href="{user_home_page}" target="_blank">{text}</a></div></div></td>')

        # Photo Title column: photo thumbnail and photo title, both linked
        if col_header == 'Photo Title':
            thumbnail = row['Photo Thumbnail Local' if use_local_thumbnails else 'Photo Thumbnail Href']
            # if photo thumbnail is empty, write empty divs to keep the same layout
            if not thumbnail.strip():
                return f'{tab5}<td><div><div><a/></div><div><a/></div></div></td>'
            if use_local_thumbnails:
                thumbnail = f"{thumbnails_folder}/{thumbnail}"
            photo_link = row['Photo Link']
            return (
                f'{tab5}<td><div><div style="width: 30%; float:left;">{tab9}<a href="{photo_link}" target="_blank">'
                f'{tab9}<img class="photo" src={thumbnail}></a></div>'
                f'{tab7}<div><a href="{photo_link}" target="_blank">{text}</a></div></div></td>')

        if col_header == 'Relationship':
            color_class_name = text.lower().replace(' ', '_')
            if file_type in ('reciprocal', 'not_follow', 'following',
                             'all_users', 'unique_users', 'all_unique_users'):
                return f'{tab5}<td class="alignLeft {color_class_name}" >{text}</td>'
            if file_type in ('notifications', 'like_actors'):
                if text == 'Following':
                    # green cell for following users from notification
                    return f'{tab5}<td class="alignLeft following_raw">{text}</td>'
                if text == 'Not Follow':
                    # default background color (white)
                    return f'{tab5}<td class="alignLeft">{text}</td>'
                return f'{tab5}<td></td>'
            return f'{tab5}<td>{text}</td>'

        if col_header == 'Content':
            return f'{tab5}<td class="alignCenter">{text}</td>'
        return f'{tab5}<td class="alignRight">{text}</td>'

    # table body: the pieces are collected in a list and joined once
    rows_count = len(rows)
    body_parts = [f'{tab3}<tbody>']
    for i, row in enumerate(rows):
        utils.update_progress(
            i / rows_count,
            f' - Writing items to html {i}/{rows_count} ...')
        body_parts.append(f'{tab4}<tr>')
        body_parts.extend(render_cell(row, header) for header in visible_headers)
        body_parts.append('</tr>')
    body_parts.append(f'{tab3}</tbody>')
    row_string = ''.join(body_parts)

    # create the main table headline ex: <h4 id="followers_headline">Followers (1234)</h4>
    headline_text = f'{TABLE_CAPTION[file_type]} ({rows_count})'
    headline_id = f'{file_type}_headline'
    headline_html_string = f'{tab1}<{headline_tag} id="{headline_id}">{headline_text}</{headline_tag}>'

    table_string = (
        f'{tab1}<div class="float_left" style="width:{main_table_width}">'
        f'{tab2}<table id="{table_id}">'
        f'{tab3}{header_string}'
        f'{tab3}{row_string}'
        f'{tab2}</table>{tab1}</div>')

    utils.update_progress(
        1, f' - Writing items to html {rows_count}/{rows_count} ...')
    return headline_html_string + table_string
def CSV_photos_list_to_HTML_table(csv_file_name,
                                  csv_type,
                                  output_lists,
                                  use_local_thumbnails=True,
                                  ignore_columns=None,
                                  start_indent=1,
                                  headline_tag='h4',
                                  encoding='utf-16'):
    """Given a csv file containing a list of photos, return a html string
    containing a description headline and the photos html table.

    - csv_type.name is the base text for the table id, headline id and
      headline text
        table id      = csv_type.name
        headline id   = [csv_type.name]_headline
        headline text = TABLE_CAPTION[csv_type.name] (# of photos)
        sample of headline string: <h4 id="unlisted_photos_headline">Unlisted Photos (1) ⇩</h4>
      The down-arrow is left out when csv_type.name is 'photos_public'.
    - The columns listed in ignore_columns are not written to the table. The
      data in these columns are still used to form the web links <a href=...>.

    Args:
        csv_file_name: path of the csv file; it must have the '.csv' extension.
        csv_type: object whose 'name' is a key of TABLE_WIDTHS and TABLE_CAPTION.
        output_lists: object providing 'thumbnails_dir'.
        use_local_thumbnails: use the 'Thumbnail Local' column for the images
            instead of the 'Thumbnail Href' column.
        ignore_columns: column headers to leave out of the table.
        start_indent: number of tabs at the first indentation level.
        headline_tag: html tag used for the headline.
        encoding: encoding of the csv file.

    Returns:
        The html string (headline, line break, table), or None when the file
        does not have the '.csv' extension.
    """
    # file name and extension check
    if os.path.splitext(csv_file_name)[1] != ".csv":
        return None
    if ignore_columns is None:
        ignore_columns = []

    # indentation strings; unlike the others, tab1 has no leading line break
    tab1 = '\t' * start_indent
    tab2, tab3, tab4, _, tab6, tab7, tab8 = (
        '\n' + tab1 + '\t' * depth for depth in range(1, 8))

    CUSTOMED_COLUMN_WIDTHS = ' <colgroup> <col style="width:4%"> <col style="width:15%"> <col span= "5" style="width:6%" > <col style="width:5%" > <col style="width:8%" > <col style="width:15%"> <col style="width:23%"> </colgroup> '

    # long header words are broken so that the column widths can be minimized
    BROKEN_HEADERS = {
        'Comments': 'Com-<br>ments',
        'Highest Pulse': 'Highest<br>Pulse',
        'Galleries': 'Gal-<br>leries',
    }

    thumbnails_folder = os.path.basename(
        os.path.normpath(output_lists.thumbnails_dir))
    table_name = csv_type.name
    table_width = TABLE_WIDTHS[table_name]

    with open(csv_file_name, newline='', encoding=encoding) as csvfile:
        reader = csv.DictReader(row.replace('\0', '') for row in csvfile)
        # fieldnames is None when the file has no header row at all
        headers = reader.fieldnames or []
        rows = list(reader)

    visible_headers = [h for h in headers if h not in ignore_columns]

    header_cells = [
        f'{tab4}<th>{BROKEN_HEADERS.get(header, header)}</th>'
        for header in visible_headers
    ]
    header_string = (f'{tab2}<thead>{tab3}<tr>' + ''.join(header_cells) +
                     f'{tab3}</tr>{tab2}</thead>')

    def render_cell(row, col_header):
        """Return the <td> markup of one cell of the given row."""
        text = row[col_header]

        # Photo Title column: photo thumbnail and photo title, both linked
        if col_header == 'Photo Title':
            thumbnail = row['Thumbnail Local' if use_local_thumbnails else 'Thumbnail Href']
            # if photo thumbnail is empty, write an empty div to keep the same layout
            if not thumbnail:
                return '\t\t\t\t<td><div><div><a/></div><div></a></div></div></td> \n'
            if use_local_thumbnails:
                thumbnail = f"{thumbnails_folder}/{thumbnail}"
            photo_link = row['Href']
            return (
                f'{tab4}<td><div><div style="width: 40%; float:left; margin-right:10px;">{tab7}<a href="{photo_link}" target="_blank">'
                f'{tab8}<img class="photo" src={thumbnail}></a></div>'
                f'{tab6}<div><a href="{photo_link}" target="_blank">{text}</a></div></div></td>')

        if col_header in ('Category', 'Tags'):
            return f'{tab4}<td class="alignLeft">{text}</td>'

        if col_header == 'Featured In Galleries' and text != '':
            # a gallery link has this format:
            # https://500px.com/[photographer_name]/galleries/[gallery_name]
            # the gallery name is whatever follows the last '/'
            links = [
                f'{tab4}<a href="{gallery}" target="_blank">{gallery[gallery.rfind("/") + 1:]}</a>'
                for gallery in text.split(',')
            ]
            return f'{tab4}<td>' + ','.join(links) + '\t\t</td>'

        # write empty string if text is '0'
        alt_text = '' if text == '0' else text
        return f'{tab4}<td>{alt_text}</td>'

    # table body: the pieces are collected in a list and joined once
    rows_count = len(rows)
    body_parts = [f'{tab2}<tbody>']
    for i, row in enumerate(rows):
        utils.update_progress(
            i / rows_count,
            f' - Writing items to html {i}/{rows_count} ...')
        body_parts.append(f'{tab3}<tr>')
        body_parts.extend(render_cell(row, header) for header in visible_headers)
        body_parts.append(f'{tab3}</tr>\n')
    body_parts.append(f'{tab2}</tbody>\n')
    row_string = ''.join(body_parts)

    # The headline ends with a down-arrow, except for the public photos table.
    # Strings are compared with ==, since identity of equal strings is not guaranteed.
    direction_arrow = '' if table_name == 'photos_public' else ' ⇩'
    headline_text = f'{TABLE_CAPTION[table_name]} ({rows_count})'
    headline_id = f'{table_name}_headline'
    headline_html_string = f'{tab1}<{headline_tag} id="{headline_id}">{headline_text} {direction_arrow}</{headline_tag}>'

    table_string = (
        f'{tab1}<table id="{table_name}" class="main" style="width:{table_width}">'
        f'{tab3}{CUSTOMED_COLUMN_WIDTHS}'
        f'{tab3}{header_string}'
        f'{tab3}{row_string}'
        f'{tab2}</table>')

    utils.update_progress(
        1, f' - Writing items to html {rows_count}/{rows_count} ...')
    return headline_html_string + '\n' + table_string