def get_item_page(item, cnt, finished_items_html, category, http_pool):
    """
    Retrieve information for the current item and pass it on for processing

    :param item: BS4 object for item being processed
    :param cnt: Item number within the current section
    :param finished_items_html: BS4 object to obtain current section
    :param category: Section of item being processed
    :param http_pool: Pool for urllib3 requests
    :return:
    """
    try:
        # Get item directory and name; hrefs look like '/wiki/<name>', so
        # strip the first 6 chars and decode %27 (') and underscores
        item_name = item.contents[0].contents[0].contents[0].get('href')
        saved_item_name = item_name[6:].replace('%27', '\'').replace('_', ' ')

        # Retrieve the html page for the current item
        item_grid_html = my_tools.get_web_page(page_name=saved_item_name,
                                               path='/Items/',
                                               sub_path=category,
                                               http_pool=http_pool)

        # Parse current item html page and process the information
        with my_globals.bs4_lock:
            item_html = BeautifulSoup(item_grid_html, 'lxml')
            get_item_info(item_name=item_name,
                          cnt=cnt,
                          finished_items_html=finished_items_html,
                          item_html=item_html)
    finally:
        # Always signal completion — even on error — so the spawner loop in
        # get_item() is never left waiting on a slot that can't free up
        with my_globals.counter_lock:
            my_globals.thread_count -= 1
def get_champ():
    """
    Get the stat information for each champion

    Spawns one worker thread per champion (bounded by
    my_globals.thread_max) and waits for them all to finish; results are
    written into my_globals.champion_info by the workers.

    :return: None (workers populate my_globals.champion_info)
    """
    champ_url = []          # Each champion wiki page
    all_champ_threads = []  # Hold all threads

    # Fetch main wiki page
    http_pool = urllib3.PoolManager()
    main_url = my_tools.get_web_page(page_name='League_of_Legends_Wiki',
                                     http_pool=http_pool)

    # Parse the HTML page for champion names
    with my_globals.bs4_lock:
        champions_html = BeautifulSoup(markup=main_url, features='lxml')
        champ_roster_ol = champions_html.find(class_="champion_roster")
        champ_roster_li = champ_roster_ol.find_all('a')

    # Get wiki page for each champion
    for champ_roster_name in champ_roster_li:
        champ_url.append(champ_roster_name.get('href').strip())

    my_tools.log_status("Getting champion info for;")

    for champ in champ_url:
        # Create a thread for each champion
        while True:
            # Only create new thread if limit has not been exceeded
            # NOTE(review): this check-then-increment is not atomic with the
            # read above, so the cap can be briefly exceeded — confirm intent
            if my_globals.thread_count < my_globals.thread_max:
                # Signal a new thread is being created
                with my_globals.counter_lock:
                    my_globals.thread_count += 1
                # Create thread for current champion
                thread = my_globals.threading.Thread(target=get_champ_info,
                                                     args=(champ, http_pool),
                                                     name=champ)
                # Append current thread to list and start thread
                all_champ_threads.append(thread)
                thread.start()
                # Exit loop once processing is done
                break
            # Wait if the thread queue is full
            time.sleep(2)

    # Wait for all threads to finish processing
    for thread in all_champ_threads:
        thread.join()
    return
def get_ability_info(champ, chrome):
    """
    Process all ability info for one champion

    Scrapes the champion's /Abilities page, parses every skill box, and
    stores the result in my_globals.ability_info.

    :param champ: Champion being processed
    :param chrome: Selenium browser to load ability boxes (shared instance)
    :return:
    """
    # Log which champion is currently being processed
    my_tools.log_status(champ)

    try:
        # Open champion page; the browser is shared across threads, so all
        # Selenium access is serialized behind selenium_lock
        with my_globals.selenium_lock:
            ability_url = my_tools.get_web_page(page_name=champ,
                                                path='/Abilities',
                                                browser=chrome)
            abilities_html = BeautifulSoup(markup=ability_url,
                                           features='lxml')

        # Use regex to find each skill box
        passive_html = abilities_html.find_all(
            'div', {'class': re.compile('skill skill.*')})

        # Hold the current abilities for this champion
        current_abilities = {}

        # Box order on the page:
        # 0 = passive, 1 = q, 2 = w, 3 = e,
        # 4 = r or q2, 5 = w2, 6 = e2, 7 = r
        for ability in passive_html:
            # Get the button name from the second CSS class (eg. 'skill_q')
            button = ability.get('class')[1].split('_', 1)[1]
            if button == 'innate':
                button = 'passive'
            # Detect if already has ability key (eg. for Jayce/Rek'sai)
            try:
                if current_abilities[button]:
                    button += '2'
            except KeyError:
                pass

            # Create entry for current button
            current_abilities[button] = {}
            # Get name for current ability
            ability = ability.contents[1].contents[2]
            current_abilities[button]['name'] = ability.get('id').replace(
                '_', ' ').replace('.27', '\'')
            # Get string for all ability stats
            info = ability.contents[1].contents[1].contents[0].contents[2]
            # Split up the list based on stats
            all_stats = info.text.split(':')

            # Detect if there is an html element (eg. image) in the text
            # and remove it
            image_start = [stat for stat in all_stats if '<' in stat]
            for illegal in image_start:
                all_stats.remove(illegal)
            # Remove the remaining part of the tag: keep only the text after
            # the last '>' (plus the space that follows it)
            image_end = [stat for stat in all_stats if '>' in stat]
            for illegal in image_end:
                location = [pos for pos, char in enumerate(illegal)
                            if char == '>']
                legal = illegal[location[len(location) - 1] + 2:]
                for cnt, stat in enumerate(all_stats):
                    if stat == illegal:
                        all_stats[cnt] = legal

            # Go through each stat
            for cnt, stat in enumerate(all_stats):
                # Don't process last one, already handled
                if cnt + 1 < len(all_stats):
                    # Split up the current stat
                    full_effect = ''
                    effect = stat.split(' ')
                    # Loop through each stat
                    for word in effect:
                        # Upper case is often used for a new type of effect
                        if word.isupper():
                            if full_effect == '':
                                full_effect += word.lower()
                            else:
                                full_effect += ''.join([' ', word.lower()])
                        # These words are not in uppercase
                        elif word == 'On-Target':
                            full_effect += word.lower()
                        elif word == 'Cooldown':
                            full_effect += ''.join([' ', word.lower()])
                    # Get the next stat
                    full_value = ''
                    value = all_stats[cnt + 1].split(' ')
                    # Loop through for each number in the next stat
                    for number in value:
                        # Often written like 5/10/15 for values
                        if number.isdigit():
                            full_value += number
                        elif number == '/':
                            full_value += '/'
                        # Global is still a valid value
                        elif number == 'Global':
                            full_value += number
                        # Test if text has a decimal
                        else:
                            try:
                                # Will error out if word, but will take
                                # decimals (eg. 2.5)
                                float(number)
                                full_value += number
                            except ValueError:
                                pass
                    # Put them together
                    current_abilities[button][full_effect] = \
                        full_value.strip()

        # Add current ability set to the global map
        my_globals.ability_info[champ.replace('\'', '_')] = current_abilities
    finally:
        # Always release the thread slot, even if fetching/parsing failed,
        # so the spawner loop in get_abilities() can make progress
        with my_globals.counter_lock:
            my_globals.thread_count -= 1
def get_abilities():
    """
    Get the ability information for each champion

    Launches a shared headless Chrome instance, then spawns one worker
    thread per champion (bounded by my_globals.thread_max) to scrape
    ability data into my_globals.ability_info.

    :return: None (workers populate my_globals.ability_info)
    """
    champ_url = []            # Each champion wiki page
    all_ability_threads = []  # Hold all threads

    # Start headless chrome to get javascript from pages
    options = webdriver.ChromeOptions()
    options.add_argument('headless')
    # TODO(review): detect if selenium/chromedriver is installed and fail
    # with a clear message instead of an unhandled exception
    # Current directory is Scraper\HTML Pages
    chrome = webdriver.Chrome(
        chrome_options=options,
        executable_path='../Chrome Driver/chromedriver.exe')
    chrome.implicitly_wait(30)

    # Fetch main wiki page
    http_pool = urllib3.PoolManager()
    main_url = my_tools.get_web_page(page_name='League_of_Legends_Wiki',
                                     http_pool=http_pool)

    # Parse the HTML page for champion names
    with my_globals.bs4_lock:
        champions_html = BeautifulSoup(markup=main_url, features='lxml')
        champ_roster_ol = champions_html.find(class_="champion_roster")
        champ_roster_li = champ_roster_ol.find_all('a')

    # Get wiki page for each champion
    for champ_roster_name in champ_roster_li:
        champ_url.append(champ_roster_name.get('href').strip())

    # General log message
    my_tools.log_status("Getting ability info for;")

    for champ in champ_url:
        # Change formatting for readability: strip the '/wiki/' prefix and
        # decode %27 (') and underscores
        champ = champ[6:].replace('%27', '\'').replace('_', ' ')

        # Create a thread for each champion
        while True:
            # Only create new thread if limit has not been exceeded
            if my_globals.thread_count < my_globals.thread_max:
                # Signal a new thread is being created
                with my_globals.counter_lock:
                    my_globals.thread_count += 1
                # Create thread for current champion; note the single
                # Selenium browser is shared by all workers and guarded by
                # my_globals.selenium_lock inside get_ability_info
                thread = my_globals.threading.Thread(target=get_ability_info,
                                                     args=(champ, chrome),
                                                     name=champ)
                # Append current thread to list and start thread
                all_ability_threads.append(thread)
                thread.start()
                # Exit loop once processing is done
                break
            # Wait if the thread queue is full
            time.sleep(2)

    # Wait for all threads to finish processing
    for thread in all_ability_threads:
        thread.join()
    return
def get_champ_info(champ, http_pool):
    """
    Scrape the base stats and per-level stats for one champion

    Results are written into my_globals.champion_info keyed by the
    champion's name.

    :param champ: Champion href (eg. '/wiki/Champion_Name')
    :param http_pool: Pool for urllib3 requests
    :return:
    """
    try:
        champion_stats = {}  # Hold the stats for the current champion
        stat_type = [
            "Health",  # Keep track of each stat
            "HealthRegen",
            "ResourceBar",
            "ResourceRegen",
            "AttackDamage",
            "AttackSpeed",
            "Armor",
            "MagicResist",
            "MovementSpeed"
        ]

        # Open champion page
        main_url = my_tools.get_web_page(
            page_name=champ[6:].replace('%27', '\'').replace('_', ' '),
            path='/Champions/',
            http_pool=http_pool)
        with my_globals.bs4_lock:
            champions_html = BeautifulSoup(markup=main_url, features='lxml')

        # Append stats to array
        for stat in stat_type:
            champ_roster_stat_html = champions_html.find(
                id=''.join([stat, "_", champ[6:].replace("%27", "_")]))
            # If the champion does not have that stat (eg. energy),
            # write '0' instead
            try:
                champion_stats[stat] = champ_roster_stat_html.text
            except AttributeError:
                champion_stats[stat] = '0'

        # Append stats/lvl to array
        for stat in stat_type:
            # Attack speed is named differently on site
            if stat == "AttackSpeed":
                stat = "AttackSpeedBonus"
            champ_roster_stat_html = champions_html.find(
                id=''.join([stat, "_", champ[6:].replace("%27", "_"),
                            "_lvl"]))
            # If the champion does not scale in that stat, write 0 instead
            try:
                champion_stats[''.join([stat, '/lvl'])] = \
                    champ_roster_stat_html.text[2:]
            except AttributeError:
                champion_stats[''.join([stat, '/lvl'])] = '0'

        # Find the mana type, location of "Secondary Bar:" text
        champions_resource_html = champions_html.find(
            style="font-size:10px; line-height:1em; display:block; "
            "color:rgb(147, 115, 65); margin-top:3px; margin-bottom:0;")

        # Try and get the direct path of the bar
        try:
            champ_resource = champions_resource_html.next_sibling.\
                next_element.contents[2].text
        except IndexError:
            champ_resource = "Manaless"

        # Add stat to stat array
        champion_stats['ResourceType'] = champ_resource

        # Write champs with stats into array
        # NOTE(review): key uses '%27' -> '-' while other functions use
        # '%27' -> "'" — confirm downstream consumers expect this format
        my_globals.champion_info[champ[6:].replace("%27",
                                                   "-")] = champion_stats
        my_tools.log_status(champ[6:])
    finally:
        # Signal thread is complete even on failure, so the spawner loop in
        # get_champ() never waits forever for a slot
        with my_globals.counter_lock:
            my_globals.thread_count -= 1
def get_item(home_directory):
    """
    Return all item information from all maps

    Walks the wiki's item grid section by section, spawning one worker
    thread per item (bounded by my_globals.thread_max); workers populate
    my_globals.item_info.

    :param home_directory: Project root; 'HTML Pages' lives beneath it
    :return: item information (via my_globals.item_info)
    """
    # Sections not used by the calculator — skipped in both passes below
    skipped_categories = {'Potions and Consumables', 'Distributed',
                          'Removed items', 'Trinkets'}

    # Log current status of program
    my_tools.log_status('Getting Item Grid')

    # Change directory to HTML pages
    os.chdir(''.join([home_directory, '/HTML Pages']))

    # Create urllib3 pool to download each web page
    http_pool = urllib3.PoolManager()
    main_url = my_tools.get_web_page(page_name='Item',
                                     path='/Items',
                                     http_pool=http_pool)

    # For formatting
    my_tools.log_status('\n')

    # Use the item page and set up parsing
    with my_globals.bs4_lock:
        item_grid_html = BeautifulSoup(markup=main_url, features='lxml')
        # Find the item grid and start to parse
        finished_items_html = item_grid_html.find(id='item-grid')

    # Loop through item grid for each item section; the grid alternates
    # header nodes (cnt % 4 == 1) and item-list nodes (cnt % 4 == 3)
    for cnt, null in enumerate(finished_items_html.contents):
        # Add section to dictionary
        if cnt % 4 == 1:
            # Save current section being worked on
            category = finished_items_html.contents[cnt].text.strip()
            # Skip sections not used by calculator
            if category in skipped_categories:
                continue
            # Log status of program
            my_tools.log_status(''.join(['Starting Section: ', category]))
            # Create entry for current section in global dictionary
            my_globals.item_info[category] = {}

        # Search though section for items
        if cnt % 4 == 3:
            # Save current section being worked on (its header is two
            # nodes back)
            category = finished_items_html.contents[cnt - 2].text.strip()
            # Skip sections not used by calculator
            if category in skipped_categories:
                continue

            # Array to hold threads
            all_item_threads = []

            # Get the page for each item in the category and start to parse
            for item in finished_items_html.contents[cnt]:
                # Save item path and readable names
                item_name = item.contents[0].contents[0].contents[0].get(
                    'href')
                current_item_name = item_name[6:].replace(
                    '%27', '\'').replace('_', ' ')

                # Create thread for each item being parsed
                while True:
                    # Only create a thread if limit has not been exceeded
                    if my_globals.thread_count < my_globals.thread_max:
                        # Signal a new thread is being created
                        with my_globals.counter_lock:
                            my_globals.thread_count += 1
                        # Create thread and process each item
                        thread = threading.Thread(target=get_item_page,
                                                  args=(item, cnt,
                                                        finished_items_html,
                                                        category, http_pool),
                                                  name=current_item_name)
                        # Append current thread to list and start thread
                        all_item_threads.append(thread)
                        thread.start()
                        # Exit loop once processing is done
                        break
                    # Wait if a thread queue is full
                    time.sleep(2)

            # Wait for all threads to finish processing
            for thread in all_item_threads:
                thread.join()
            # For formatting
            my_tools.log_status('\n')
    return