Example #1
0
def scroll_to_end_by_class_name(driver, class_name, number_requested):
    """Scroll until no further elements of the given class name get loaded.

    Each pass brings the last matching element into view, waits for it to become
    visible, then queries the page again. The loop ends when a new query finds
    the same number of elements as the previous pass, when the matching elements
    disappear, or when Selenium reports a time-out / missing element.

    Args:
        driver: object providing ``find_elements_by_class_name``; it is also
            handed to ``WebDriverWait``.
        class_name: class name of the elements to look for.
        number_requested: expected number of items. It is only used to scale the
            progress bar, it does not stop the scrolling.
    """
    eles = driver.find_elements_by_class_name(class_name)
    count = 0
    new_count = len(eles)

    while new_count != count:
        if not eles:
            # The elements found earlier are gone: there is nothing to scroll to.
            break
        # A request of 0 items would otherwise raise ZeroDivisionError.
        progress = new_count / number_requested if number_requested else 1
        try:
            utils.update_progress(
                progress,
                f'    - Scrolling down to load more items {new_count}/{number_requested}:'
            )
            the_last_in_list = eles[-1]
            # Reading this Selenium property is what scrolls the element into view.
            the_last_in_list.location_once_scrolled_into_view
            time.sleep(random.randint(15, 20) / 10)
            try:
                WebDriverWait(driver, timeout=60).until(
                    EC.visibility_of(the_last_in_list))
            except TimeoutException:
                # Best effort: count whatever has been loaded so far.
                pass
            count = new_count
            eles = driver.find_elements_by_class_name(class_name)
            new_count = len(eles)
        except TimeoutException:
            printR('   Time out while scrolling down. Please retry.')
            # The counters were not updated, so looping again would repeat the
            # very same failing step without end.
            break
        except NoSuchElementException:
            # Same reasoning: stop instead of retrying forever.
            break
    # NOTE(review): no final update is emitted when the requested number was reached.
    if new_count < number_requested:
        utils.update_progress(
            1,
            f'    - Scrolling down to load more items:{new_count}/{number_requested}'
        )
Example #2
0
def _find_items_to_check(web_element, class_name, tag_name, xpath):
    """Look up the monitored items with the first non-empty selector: class name, tag name, then xpath."""
    if class_name:
        return web_element.find_elements_by_class_name(class_name)
    if tag_name:
        return web_element.find_elements_by_tag_name(tag_name)
    return web_element.find_elements_by_xpath(xpath)


def scroll_down_active_page(driver,
                            web_element=None,
                            class_name_to_check='',
                            tag_name_to_check='',
                            xpath_to_check='',
                            number_requested=100,
                            message='',
                            time_out=60):
    """Scroll the page down until the requested number of items is loaded.

    After each scroll the page height is polled once per second until it
    changes. The waiting budget of ``time_out`` polls is shared by all scrolls
    (it is never reset); once it is used up and the items are still fewer than
    requested, a time-out message is printed and the scrolling stops.

    Args:
        driver: object used to run the scrolling javascript.
        web_element: element the items are searched in; defaults to ``driver``.
        class_name_to_check: class name of the items. Takes priority over the tag name.
        tag_name_to_check: tag name of the items. Takes priority over the xpath.
        xpath_to_check: xpath of the items, used when the other two are empty.
        number_requested: number of items wanted; -1 means
            ``config.MAX_NOTIFICATION_REQUEST``.
        message: currently not used by the implementation.
        time_out: total number of one-second polls allowed while waiting for
            the page height to change.

    Returns:
        The list of items found, or None when no selector was given or when
        nothing matched before scrolling.
    """
    if web_element is None:
        web_element = driver
    if not (class_name_to_check or tag_name_to_check or xpath_to_check):
        printR('   Items were not specified. The process stopped.')
        return None
    items = _find_items_to_check(web_element, class_name_to_check,
                                 tag_name_to_check, xpath_to_check)
    if not items:
        printR('   No items found.')
        return None

    # Translate the "as many as possible" sentinel BEFORE comparing against it:
    # any length is >= -1, so checking first would always return immediately.
    if number_requested == -1:
        number_requested = config.MAX_NOTIFICATION_REQUEST

    # Enough items are already on the page: hand them back, as the normal exit does.
    if len(items) >= number_requested:
        return items

    height_script = "return document.body.scrollHeight"
    last_scroll_height = driver.execute_script(height_script)

    time_out_count_down = time_out
    count_sofar = len(items)

    while count_sofar < number_requested:
        utils.update_progress(
            count_sofar / number_requested,
            f'    - Scrolling down {count_sofar}/{number_requested}')

        # scroll down to bottom
        driver.execute_script(
            "window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(1)
        new_scroll_height = driver.execute_script(height_script)

        # give the slow server a chance to load the new items
        while new_scroll_height == last_scroll_height and time_out_count_down >= 0:
            time_out_count_down -= 1
            new_scroll_height = driver.execute_script(height_script)
            time.sleep(1)

        last_scroll_height = new_scroll_height

        items = _find_items_to_check(web_element, class_name_to_check,
                                     tag_name_to_check, xpath_to_check)
        count_sofar = len(items)

        if count_sofar < number_requested and time_out_count_down <= 0:
            printR(
                f'\n   Time out ({time_out}s)! {count_sofar}/{number_requested} items obtained. You may try again at another time'
            )
            break

    # normal termination of while loop: show completed progress bar
    else:
        utils.update_progress(
            1, f'    - Scrolling down {number_requested}/{number_requested}')

    return items
Example #3
0
def scroll_to_end_by_tag_name_within_element(driver,
                                             element,
                                             tag_name,
                                             number_requested,
                                             time_out=20):
    """Scroll until no further elements of the given tag name get loaded within an element.

    Each pass brings the last matching element into view, waits for it to become
    visible, then queries the elements again. When a query finds nothing new and
    fewer items than requested are present, the query is repeated once per second
    while a count-down runs. The count-down starts at ``time_out`` and is shared
    by all passes (it is never reset).

    Args:
        driver: object handed to ``WebDriverWait``.
        element: container passed to ``check_and_get_all_elements_by_tag_name``.
        tag_name: tag name of the elements to look for.
        number_requested: expected number of items; used for the progress bar and
            to decide whether waiting for a slow server is worthwhile.
        time_out: seconds given to each visibility wait, and start value of the
            slow-server count-down.

    Returns:
        The elements found by the last query.
    """
    eles = check_and_get_all_elements_by_tag_name(element, tag_name)
    count = 0
    new_count = len(eles)
    count_down_timer = time_out
    while new_count != count:
        if not eles:
            # The elements found earlier are gone: there is nothing to scroll to.
            break
        # A request of 0 items would otherwise raise ZeroDivisionError.
        progress = new_count / number_requested if number_requested else 1
        try:
            utils.update_progress(
                progress,
                f'    - Scrolling down to load more items {new_count}/{number_requested}:'
            )
            the_last_in_list = eles[-1]
            # Reading this Selenium property is what scrolls the element into view.
            the_last_in_list.location_once_scrolled_into_view
            time.sleep(1)
            try:
                WebDriverWait(driver, time_out).until(
                    EC.visibility_of(the_last_in_list))
            except TimeoutException:
                # Best effort: count whatever has been loaded so far.
                pass

            count = new_count
            eles = check_and_get_all_elements_by_tag_name(element, tag_name)
            new_count = len(eles)

            # give the slow server a chance to load the new items
            while new_count == count and count_down_timer >= 0 and new_count < number_requested:
                utils.update_progress(
                    # time_out == 0 would otherwise raise ZeroDivisionError
                    count_down_timer / time_out if time_out else 0,
                    f'    - Slow response from server. Counting down {count_down_timer}:'
                )
                count_down_timer -= 1

                eles = check_and_get_all_elements_by_tag_name(
                    element, tag_name)
                new_count = len(eles)
                if eles:
                    eles[-1].location_once_scrolled_into_view
                time.sleep(1)

        except TimeoutException:
            printR(
                f'   Time out ({time_out}s) while scrolling down. Please retry.'
            )
            # The counters may not have been updated, so looping again could
            # repeat the very same failing step without end.
            break
        except NoSuchElementException:
            # Same reasoning: stop instead of retrying forever.
            break

    # Close the progress bar; never report more items than were requested.
    shown_count = min(new_count, number_requested)
    utils.update_progress(
        1,
        f'    - Scrolling down to load more items:{shown_count} / {number_requested}'
    )

    return eles
Example #4
0
def scroll_down(driver,
                scroll_pause_time=0.5,
                number_of_scrolls=10,
                estimate_scrolls_needed=3,
                message=''):
    """Scroll the active window to the bottom, a controlled number of times.

    Modes, selected by number_of_scrolls:
      -  0: return right away, without scrolling.
      - -1: keep scrolling until the page height stops growing. The progress bar
            is then scaled with estimate_scrolls_needed
            (total requested items / items loaded per scroll). If that estimate is
            -1 as well, a running count of loaded notifications is written to
            stdout instead of a progress bar.
      -  otherwise: stop after that many scrolls, or sooner if the page height
            stops growing.

    scroll_pause_time is the wait, in seconds, after every scroll (and once more
    before returning) so the page can load its items.
    message is the text handed to the progress bar.
    """
    if number_of_scrolls == 0:
        return

    height_query = "return document.body.scrollHeight"
    unbounded = number_of_scrolls == -1
    open_ended = unbounded and estimate_scrolls_needed == -1

    previous_height = driver.execute_script(height_query)
    passes = 0  # number of scrolls completed so far
    while True:
        if open_ended:
            # no idea when it ends: show how many notifications should be loaded by now
            loaded = passes * config.NOTIFICATION_PER_LOAD
            sys.stdout.write(f'\r{message} {str(loaded)}')
            sys.stdout.flush()
        elif unbounded:
            utils.update_progress(passes / estimate_scrolls_needed, message)
        elif passes > 0:
            utils.update_progress(passes / number_of_scrolls, message)

        passes += 1

        driver.execute_script(
            "window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(scroll_pause_time)
        # the result is not needed; the call makes sure the document javascript is executed
        driver.execute_script("return document.body.innerHTML")

        # exit #1: the requested number of scrolls has been done
        if not unbounded and passes >= number_of_scrolls:
            break

        # exit #2: the height did not change, i.e. everything is loaded or the
        #          server did not respond within scroll_pause_time
        current_height = driver.execute_script(height_query)
        if current_height == previous_height:
            break
        previous_height = current_height

    if open_ended:
        # finish the running count with a line-feed
        sys.stdout.write('\r\n')
        sys.stdout.flush()
    else:
        # force the display of the completed progress bar
        utils.update_progress(1, message)

    time.sleep(scroll_pause_time)
Example #5
0
def CSV_list_to_HTML_table(csv_file_name,
                           csv_file_type,
                           output_lists,
                           use_local_thumbnails=True,
                           ignore_columns=None,
                           encoding='utf-16',
                           column_to_sort='No',
                           start_indent=1,
                           headline_tag='h4'):
    """Given a csv file containing a list of items, return a html string made of a headline and a table.

     - The table id is always 'main_table'.
       headline id = [csv_file_type.name]_headline
       headline text = TABLE_CAPTION[csv_file_type.name] (number of rows)
       sample of headline string: <h4 id="followers_headline">Followers (1234)</h4>
     - Columns listed in ignore_columns are not shown. Their data is still used to
       build links and image sources.
     - 'Display Name' cells show the avatar and the name, linked to https://500px.com/[User Name].
       'Photo Title' cells show the thumbnail and the title, linked to 'Photo Link'.
       'Relationship' cells are styled according to csv_file_type.name.
     - Values read from the csv file are html-escaped, missing values are shown as empty text.

    Args:
        csv_file_name: path of the csv file; must have the extension '.csv'.
        csv_file_type: object whose ``name`` is a key of TABLE_WIDTHS and TABLE_CAPTION.
        output_lists: object providing ``avatars_dir`` and ``thumbnails_dir``.
        use_local_thumbnails: use the '... Local' columns (files inside the avatars/thumbnails
            folders) for images instead of the '... Href' columns.
        ignore_columns: headers of the columns to hide.
        encoding: encoding of the csv file.
        column_to_sort: currently not used by the implementation.
        start_indent: number of tabs the outermost tags are indented with.
        headline_tag: tag used for the headline.

    Returns:
        The html string, or '' if the file name is empty, is not a '.csv' file, or the
        file has fewer than 3 columns (including an empty file).
    """
    # Function-scope import: only needed here, for escaping the cell values.
    import html

    if csv_file_name == '':
        return ''
    if os.path.splitext(csv_file_name)[1] != ".csv":
        return ''

    def esc(value):
        """Escape a csv value for html text or a quoted attribute; a missing value becomes ''."""
        return html.escape('' if value is None else str(value))

    if ignore_columns is None:
        ignore_columns = []

    # tabN = line break followed by (start_indent + N - 1) tabs; tab6 is not needed
    tab1, tab2, tab3, tab4, tab5, _, tab7, tab8, tab9 = (
        '\n' + '\t' * (start_indent + depth) for depth in range(9))

    table_id = 'main_table'
    type_name = csv_file_type.name
    avatars_folder = os.path.basename(
        os.path.normpath(output_lists.avatars_dir))
    thumbnails_folder = os.path.basename(
        os.path.normpath(output_lists.thumbnails_dir))
    main_table_width = TABLE_WIDTHS[type_name]

    with open(csv_file_name, newline='', encoding=encoding) as csvfile:
        reader = csv.DictReader(row.replace('\0', '') for row in csvfile)
        headers = reader.fieldnames
        # fieldnames is None for an empty file
        if not headers or len(headers) < 3:
            printR(f'   File {csv_file_name} is in wrong format!')
            return ''
        rows = list(reader)

    visible_headers = [
        header for header in headers if header not in ignore_columns
    ]

    # table header
    header_cells = [
        f'{tab5}<th width="{COL_WIDTHS[header]}"><div class="hdr_text">{esc(header)}</div></th>'
        for header in visible_headers
    ]
    header_string = (f'{tab3}<thead>{tab4}<tr>' + ''.join(header_cells) +
                     '</tr></thead>')

    # file types whose Relationship cells get a css class named after the cell text
    colored_types = ('reciprocal', 'not_follow', 'following', 'all_users',
                     'unique_users', 'all_unique_users')
    notification_types = ('notifications', 'like_actors')
    thumbnail_column = ('Photo Thumbnail Local'
                        if use_local_thumbnails else 'Photo Thumbnail Href')

    # table rows; the pieces are joined once at the end instead of growing a string
    rows_count = len(rows)
    parts = [f'{tab3}<tbody>']
    for i, row in enumerate(rows):
        utils.update_progress(
            i / rows_count,
            f'    - Writing items to html {i}/{rows_count} ...')
        parts.append(f'{tab4}<tr>')

        for col_header in visible_headers:
            raw_text = row[col_header]
            text = esc(raw_text)

            # In Display Name column, show user's avatar and the display name with link
            if col_header == 'Display Name':
                user_home_page = esc(f'https://500px.com/{row["User Name"]}')
                if use_local_thumbnails:
                    user_avatar = esc(f"{avatars_folder}/{row['Avatar Local']}")
                else:
                    user_avatar = esc(row['Avatar Href'])
                parts.append(
                    f'{tab5}<td><div><div style="width: 30%; float:left;">{tab8}<a href="{user_home_page}" target="_blank">'
                    f'{tab9}<img src="{user_avatar}"></a></div>'
                    f'{tab7}<div><a href="{user_home_page}" target="_blank">{text}</a></div></div></td>'
                )

            # In Photo Title column, show photo thumbnail and photo title with <a href> link
            elif col_header == 'Photo Title':
                thumbnail = row[thumbnail_column]
                if not (thumbnail or '').strip():
                    # no thumbnail: write empty divs to keep the same layout
                    parts.append(
                        f'{tab5}<td><div><div><a/></div><div><a/></div></div></td>'
                    )
                else:
                    if use_local_thumbnails:
                        thumbnail = f"{thumbnails_folder}/{thumbnail}"
                    photo_thumbnail = esc(thumbnail)
                    photo_link = esc(row['Photo Link'])
                    parts.append(
                        f'{tab5}<td><div><div style="width: 30%; float:left;">{tab9}<a href="{photo_link}" target="_blank">'
                        f'{tab9}<img class="photo" src="{photo_thumbnail}"></a></div>'
                        f'{tab7}<div><a href="{photo_link}" target="_blank">{text}</a></div></div></td>'
                    )

            elif col_header == 'Relationship':
                if type_name in colored_types:
                    color_class_name = esc(
                        (raw_text or '').lower().replace(' ', '_'))
                    parts.append(
                        f'{tab5}<td class="alignLeft {color_class_name}" >{text}</td>'
                    )
                elif type_name in notification_types:
                    if raw_text == 'Following':
                        # green cell for following users from notification
                        parts.append(
                            f'{tab5}<td class="alignLeft following_raw">{text}</td>'
                        )
                    elif raw_text == 'Not Follow':
                        # default background color (white)
                        parts.append(f'{tab5}<td class="alignLeft">{text}</td>')
                    else:
                        parts.append(f'{tab5}<td></td>')
                else:
                    parts.append(f'{tab5}<td>{text}</td>')

            elif col_header == 'Content':
                parts.append(f'{tab5}<td class="alignCenter">{text}</td>')

            else:
                parts.append(f'{tab5}<td class="alignRight">{text}</td>')

        parts.append('</tr>')
    parts.append(f'{tab3}</tbody>')
    row_string = ''.join(parts)

    # create the main table headline ex: <h4 id="followers_headline">Followers (1234)</h4>
    headline_text = f'{TABLE_CAPTION[type_name]} ({rows_count})'
    headline_id = f'{type_name}_headline'
    headline_html_string = f'{tab1}<{headline_tag} id="{headline_id}">{headline_text}</{headline_tag}>'

    table_string = (
        f'{tab1}<div class="float_left" style="width:{main_table_width}">'
        f'{tab2}<table id="{table_id}">'
        f'{tab3}{header_string}'
        f'{tab3}{row_string}'
        f'{tab2}</table>{tab1}</div>')

    utils.update_progress(
        1, f'    - Writing items to html {rows_count}/{rows_count} ...')
    return headline_html_string + table_string
Example #6
0
def CSV_photos_list_to_HTML_table(csv_file_name,
                                  csv_type,
                                  output_lists,
                                  use_local_thumbnails=True,
                                  ignore_columns=None,
                                  start_indent=1,
                                  headline_tag='h4',
                                  encoding='utf-16'):
    """Given a csv file containing a list of photos, return a html string made of a headline and the photos table.

    - csv_type.name is the base text for the table id, the headline id and the headline text:
      table id = csv_type.name
      headline id = [csv_type.name]_headline
      headline text = TABLE_CAPTION[csv_type.name] (number of photos)
      sample of headline string: <h4 id="unlisted_photos_headline">Unlisted Photos  (1) &nbsp;&nbsp; &#8681;</h4>
      The trailing down-arrow is left out when csv_type.name is 'photos_public'.
    - Columns listed in ignore_columns are not shown. Their data is still used to form
      the web link tags <a href=...>.
    - 'Photo Title' cells show the thumbnail and the title, linked to 'Href'.
      'Featured In Galleries' cells turn a comma-separated list of gallery links into
      <a> tags named after the last part of each link.
      In the remaining plain cells a value of '0' is shown as empty text.
    - Values read from the csv file are html-escaped, missing values are shown as empty text.

    Args:
        csv_file_name: path of the csv file; must have the extension '.csv'.
        csv_type: object whose ``name`` is a key of TABLE_WIDTHS and TABLE_CAPTION.
        output_lists: object providing ``thumbnails_dir``.
        use_local_thumbnails: use the 'Thumbnail Local' column (a file inside the thumbnails
            folder) for images instead of the 'Thumbnail Href' column.
        ignore_columns: headers of the columns to hide.
        start_indent: number of tabs the outermost tags are indented with.
        headline_tag: tag used for the headline.
        encoding: encoding of the csv file.

    Returns:
        The html string, or None if csv_file_name does not have the extension '.csv'.
        An empty csv file gives a table without columns and rows.
    """
    # Function-scope import: only needed here, for escaping the cell values.
    import html

    # file name and extension check
    if os.path.splitext(csv_file_name)[1] != ".csv":
        return None

    def esc(value):
        """Escape a csv value for html text or a quoted attribute; a missing value becomes ''."""
        return html.escape('' if value is None else str(value))

    if ignore_columns is None:
        ignore_columns = []

    # tab1 has no line break; tabN (N > 1) = line break + tab1 + (N - 1) tabs; tab5 is not needed
    tab1 = '\t' * start_indent
    tab2, tab3, tab4, _, tab6, tab7, tab8 = (
        '\n' + tab1 + '\t' * depth for depth in range(1, 8))

    CUSTOMED_COLUMN_WIDTHS = """
        <colgroup>
		    <col style="width:4%">    
		    <col style="width:15%">
		    <col span= "5" style="width:6%" >
		    <col style="width:5%" >
		    <col style="width:8%" >
		    <col style="width:15%">	
		    <col style="width:23%">				
	    </colgroup> """

    # headers that are broken into two lines so that their columns can stay narrow
    header_labels = {
        'Comments': 'Com-<br>ments',
        'Highest Pulse': 'Highest<br>Pulse',
        'Galleries': 'Gal-<br>leries',
    }

    thumbnails_folder = os.path.basename(
        os.path.normpath(output_lists.thumbnails_dir))
    table_width = TABLE_WIDTHS[csv_type.name]
    thumbnail_column = ('Thumbnail Local'
                        if use_local_thumbnails else 'Thumbnail Href')

    with open(csv_file_name, newline='', encoding=encoding) as csvfile:
        reader = csv.DictReader(row.replace('\0', '') for row in csvfile)
        # fieldnames is None for an empty file
        headers = reader.fieldnames or []
        rows = list(reader)

    visible_headers = [
        header for header in headers if header not in ignore_columns
    ]

    # table header
    header_cells = [
        f'{tab4}<th>{header_labels.get(header, esc(header))}</th>'
        for header in visible_headers
    ]
    header_string = (f'{tab2}<thead>{tab3}<tr>' + ''.join(header_cells) +
                     f'{tab3}</tr>{tab2}</thead>')

    # table rows; the pieces are joined once at the end instead of growing a string
    rows_count = len(rows)
    parts = [f'{tab2}<tbody>']
    for i, row in enumerate(rows):
        utils.update_progress(
            i / rows_count,
            f'    - Writing items to html {i}/{rows_count} ...')
        parts.append(f'{tab3}<tr>')

        for col_header in visible_headers:
            raw_text = row[col_header]
            text = esc(raw_text)

            # In Photo Title column, show photo thumbnail and photo title with <a href> link
            if col_header == 'Photo Title':
                thumbnail = row[thumbnail_column]
                if not (thumbnail or '').strip():
                    # no thumbnail: write empty divs to keep the same layout
                    parts.append(
                        f'{tab4}<td><div><div><a/></div><div><a/></div></div></td>'
                    )
                else:
                    if use_local_thumbnails:
                        thumbnail = f"{thumbnails_folder}/{thumbnail}"
                    photo_thumbnail = esc(thumbnail)
                    photo_link = esc(row['Href'])
                    parts.append(
                        f'{tab4}<td><div><div style="width: 40%; float:left; margin-right:10px;">{tab7}<a href="{photo_link}" target="_blank">'
                        f'{tab8}<img class="photo" src="{photo_thumbnail}"></a></div>'
                        f'{tab6}<div><a href="{photo_link}" target="_blank">{text}</a></div></div></td>'
                    )

            elif col_header in ('Category', 'Tags'):
                parts.append(f'{tab4}<td class="alignLeft">{text}</td>')

            elif col_header == 'Featured In Galleries' and raw_text:
                # a gallery link has this format: https://500px.com/[photographer_name]/galleries/[gallery_name]
                links = []
                for gallery in raw_text.split(','):
                    gallery = gallery.strip()
                    gallery_name = gallery[gallery.rfind('/') + 1:]
                    links.append(
                        f'{tab4}<a href="{esc(gallery)}" target="_blank">{esc(gallery_name)}</a>'
                    )
                parts.append(f'{tab4}<td>' + ','.join(links) + '\t\t</td>')

            else:
                # write empty string if text == 0
                alt_text = '' if raw_text == '0' else text
                parts.append(f'{tab4}<td>{alt_text}</td>')

        parts.append(f'{tab3}</tr>\n')
    parts.append(f'{tab2}</tbody>\n')
    row_string = ''.join(parts)

    # The less important tables are hidden on page load and get a down-arrow at the
    # end of their headline; the main table does not need one.
    direction_arrow = '' if csv_type.name == 'photos_public' else '&nbsp;&nbsp; &#8681;'
    headline_text = f'{TABLE_CAPTION[csv_type.name]}  ({rows_count})'
    headline_id = f'{csv_type.name}_headline'
    headline_html_string = f'{tab1}<{headline_tag} id="{headline_id}">{headline_text} {direction_arrow}</{headline_tag}>'

    table_string = (
        f'{tab1}<table id="{csv_type.name}" class="main" style="width:{table_width}">'
        f'{tab3}{CUSTOMED_COLUMN_WIDTHS}'
        f'{tab3}{header_string}'
        f'{tab3}{row_string}'
        f'{tab2}</table>')

    utils.update_progress(
        1, f'    - Writing items to html {rows_count}/{rows_count} ...')
    return headline_html_string + '\n' + table_string