def get_item_page(item, cnt, finished_items_html, category, http_pool):
    """
    Retrieve information for the current item and pass it on for processing

    :param item: BS4 object for item being processed
    :param cnt: Item number within the current section
    :param finished_items_html: BS4 object to obtain current section
    :param category: Section of item being processed
    :param http_pool: Pool for urllib3 requests
    :return:
    """
    try:
        # Get item directory and name; hrefs look like '/wiki/<name>', so
        # strip the first 6 chars and decode %27 (') and underscores
        item_name = item.contents[0].contents[0].contents[0].get('href')
        saved_item_name = item_name[6:].replace('%27', '\'').replace('_', ' ')

        # Retrieve the html page for the current item
        item_grid_html = my_tools.get_web_page(page_name=saved_item_name,
                                               path='/Items/',
                                               sub_path=category,
                                               http_pool=http_pool)

        # Parse current item html page and process the information
        with my_globals.bs4_lock:
            item_html = BeautifulSoup(item_grid_html, 'lxml')
            get_item_info(item_name=item_name,
                          cnt=cnt,
                          finished_items_html=finished_items_html,
                          item_html=item_html)
    finally:
        # Always signal completion — even on error — so the spawner loop in
        # get_item() is never left waiting on a slot that can't free up
        with my_globals.counter_lock:
            my_globals.thread_count -= 1
def get_champ():
    """
    Get the stat information for each champion

    Spawns one worker thread per champion (bounded by
    my_globals.thread_max) and waits for them all to finish; results are
    written into my_globals.champion_info by the workers.

    :return: None (workers populate my_globals.champion_info)
    """
    champ_url = []          # Each champion wiki page
    all_champ_threads = []  # Hold all threads

    # Fetch main wiki page
    http_pool = urllib3.PoolManager()
    main_url = my_tools.get_web_page(page_name='League_of_Legends_Wiki',
                                     http_pool=http_pool)

    # Parse the HTML page for champion names
    with my_globals.bs4_lock:
        champions_html = BeautifulSoup(markup=main_url, features='lxml')
        champ_roster_ol = champions_html.find(class_="champion_roster")
        champ_roster_li = champ_roster_ol.find_all('a')

    # Get wiki page for each champion
    for champ_roster_name in champ_roster_li:
        champ_url.append(champ_roster_name.get('href').strip())

    my_tools.log_status("Getting champion info for;")

    for champ in champ_url:
        # Create a thread for each champion
        while True:
            # Only create new thread if limit has not been exceeded
            # NOTE(review): this check-then-increment is not atomic with the
            # read above, so the cap can be briefly exceeded — confirm intent
            if my_globals.thread_count < my_globals.thread_max:
                # Signal a new thread is being created
                with my_globals.counter_lock:
                    my_globals.thread_count += 1
                # Create thread for current champion
                thread = my_globals.threading.Thread(target=get_champ_info,
                                                     args=(champ, http_pool),
                                                     name=champ)
                # Append current thread to list and start thread
                all_champ_threads.append(thread)
                thread.start()
                # Exit loop once processing is done
                break
            # Wait if the thread queue is full
            time.sleep(2)

    # Wait for all threads to finish processing
    for thread in all_champ_threads:
        thread.join()
    return
def get_ability_info(champ, chrome):
    """
    Process all ability info for one champion

    Scrapes the champion's /Abilities page, parses every skill box, and
    stores the result in my_globals.ability_info.

    :param champ: Champion being processed
    :param chrome: Selenium browser to load ability boxes (shared instance)
    :return:
    """
    # Log which champion is currently being processed
    my_tools.log_status(champ)

    try:
        # Open champion page; the browser is shared across threads, so all
        # Selenium access is serialized behind selenium_lock
        with my_globals.selenium_lock:
            ability_url = my_tools.get_web_page(page_name=champ,
                                                path='/Abilities',
                                                browser=chrome)
            abilities_html = BeautifulSoup(markup=ability_url,
                                           features='lxml')

        # Use regex to find each skill box
        passive_html = abilities_html.find_all(
            'div', {'class': re.compile('skill skill.*')})

        # Hold the current abilities for this champion
        current_abilities = {}

        # Box order on the page:
        # 0 = passive, 1 = q, 2 = w, 3 = e,
        # 4 = r or q2, 5 = w2, 6 = e2, 7 = r
        for ability in passive_html:
            # Get the button name from the second CSS class (eg. 'skill_q')
            button = ability.get('class')[1].split('_', 1)[1]
            if button == 'innate':
                button = 'passive'
            # Detect if already has ability key (eg. for Jayce/Rek'sai)
            try:
                if current_abilities[button]:
                    button += '2'
            except KeyError:
                pass

            # Create entry for current button
            current_abilities[button] = {}
            # Get name for current ability
            ability = ability.contents[1].contents[2]
            current_abilities[button]['name'] = ability.get('id').replace(
                '_', ' ').replace('.27', '\'')
            # Get string for all ability stats
            info = ability.contents[1].contents[1].contents[0].contents[2]
            # Split up the list based on stats
            all_stats = info.text.split(':')

            # Detect if there is an html element (eg. image) in the text
            # and remove it
            image_start = [stat for stat in all_stats if '<' in stat]
            for illegal in image_start:
                all_stats.remove(illegal)
            # Remove the remaining part of the tag: keep only the text after
            # the last '>' (plus the space that follows it)
            image_end = [stat for stat in all_stats if '>' in stat]
            for illegal in image_end:
                location = [pos for pos, char in enumerate(illegal)
                            if char == '>']
                legal = illegal[location[len(location) - 1] + 2:]
                for cnt, stat in enumerate(all_stats):
                    if stat == illegal:
                        all_stats[cnt] = legal

            # Go through each stat
            for cnt, stat in enumerate(all_stats):
                # Don't process last one, already handled
                if cnt + 1 < len(all_stats):
                    # Split up the current stat
                    full_effect = ''
                    effect = stat.split(' ')
                    # Loop through each stat
                    for word in effect:
                        # Upper case is often used for a new type of effect
                        if word.isupper():
                            if full_effect == '':
                                full_effect += word.lower()
                            else:
                                full_effect += ''.join([' ', word.lower()])
                        # These words are not in uppercase
                        elif word == 'On-Target':
                            full_effect += word.lower()
                        elif word == 'Cooldown':
                            full_effect += ''.join([' ', word.lower()])
                    # Get the next stat
                    full_value = ''
                    value = all_stats[cnt + 1].split(' ')
                    # Loop through for each number in the next stat
                    for number in value:
                        # Often written like 5/10/15 for values
                        if number.isdigit():
                            full_value += number
                        elif number == '/':
                            full_value += '/'
                        # Global is still a valid value
                        elif number == 'Global':
                            full_value += number
                        # Test if text has a decimal
                        else:
                            try:
                                # Will error out if word, but will take
                                # decimals (eg. 2.5)
                                float(number)
                                full_value += number
                            except ValueError:
                                pass
                    # Put them together
                    current_abilities[button][full_effect] = \
                        full_value.strip()

        # Add current ability set to the global map
        my_globals.ability_info[champ.replace('\'', '_')] = current_abilities
    finally:
        # Always release the thread slot, even if fetching/parsing failed,
        # so the spawner loop in get_abilities() can make progress
        with my_globals.counter_lock:
            my_globals.thread_count -= 1
def get_abilities():
    """
    Get the ability information for each champion

    Launches a shared headless Chrome instance, then spawns one worker
    thread per champion (bounded by my_globals.thread_max) to scrape
    ability data into my_globals.ability_info.

    :return: None (workers populate my_globals.ability_info)
    """
    champ_url = []            # Each champion wiki page
    all_ability_threads = []  # Hold all threads

    # Start headless chrome to get javascript from pages
    options = webdriver.ChromeOptions()
    options.add_argument('headless')
    # TODO(review): detect if selenium/chromedriver is installed and fail
    # with a clear message instead of an unhandled exception
    # Current directory is Scraper\HTML Pages
    chrome = webdriver.Chrome(
        chrome_options=options,
        executable_path='../Chrome Driver/chromedriver.exe')
    chrome.implicitly_wait(30)

    # Fetch main wiki page
    http_pool = urllib3.PoolManager()
    main_url = my_tools.get_web_page(page_name='League_of_Legends_Wiki',
                                     http_pool=http_pool)

    # Parse the HTML page for champion names
    with my_globals.bs4_lock:
        champions_html = BeautifulSoup(markup=main_url, features='lxml')
        champ_roster_ol = champions_html.find(class_="champion_roster")
        champ_roster_li = champ_roster_ol.find_all('a')

    # Get wiki page for each champion
    for champ_roster_name in champ_roster_li:
        champ_url.append(champ_roster_name.get('href').strip())

    # General log message
    my_tools.log_status("Getting ability info for;")

    for champ in champ_url:
        # Change formatting for readability: strip the '/wiki/' prefix and
        # decode %27 (') and underscores
        champ = champ[6:].replace('%27', '\'').replace('_', ' ')

        # Create a thread for each champion
        while True:
            # Only create new thread if limit has not been exceeded
            if my_globals.thread_count < my_globals.thread_max:
                # Signal a new thread is being created
                with my_globals.counter_lock:
                    my_globals.thread_count += 1
                # Create thread for current champion; note the single
                # Selenium browser is shared by all workers and guarded by
                # my_globals.selenium_lock inside get_ability_info
                thread = my_globals.threading.Thread(target=get_ability_info,
                                                     args=(champ, chrome),
                                                     name=champ)
                # Append current thread to list and start thread
                all_ability_threads.append(thread)
                thread.start()
                # Exit loop once processing is done
                break
            # Wait if the thread queue is full
            time.sleep(2)

    # Wait for all threads to finish processing
    for thread in all_ability_threads:
        thread.join()
    return
def get_champ_info(champ, http_pool):
    """
    Scrape the base stats and per-level stats for one champion

    Results are written into my_globals.champion_info keyed by the
    champion's name.

    :param champ: Champion href (eg. '/wiki/Champion_Name')
    :param http_pool: Pool for urllib3 requests
    :return:
    """
    try:
        champion_stats = {}  # Hold the stats for the current champion
        stat_type = [
            "Health",  # Keep track of each stat
            "HealthRegen",
            "ResourceBar",
            "ResourceRegen",
            "AttackDamage",
            "AttackSpeed",
            "Armor",
            "MagicResist",
            "MovementSpeed"
        ]

        # Open champion page
        main_url = my_tools.get_web_page(
            page_name=champ[6:].replace('%27', '\'').replace('_', ' '),
            path='/Champions/',
            http_pool=http_pool)
        with my_globals.bs4_lock:
            champions_html = BeautifulSoup(markup=main_url, features='lxml')

        # Append stats to array
        for stat in stat_type:
            champ_roster_stat_html = champions_html.find(
                id=''.join([stat, "_", champ[6:].replace("%27", "_")]))
            # If the champion does not have that stat (eg. energy),
            # write '0' instead
            try:
                champion_stats[stat] = champ_roster_stat_html.text
            except AttributeError:
                champion_stats[stat] = '0'

        # Append stats/lvl to array
        for stat in stat_type:
            # Attack speed is named differently on site
            if stat == "AttackSpeed":
                stat = "AttackSpeedBonus"
            champ_roster_stat_html = champions_html.find(
                id=''.join([stat, "_", champ[6:].replace("%27", "_"),
                            "_lvl"]))
            # If the champion does not scale in that stat, write 0 instead
            try:
                champion_stats[''.join([stat, '/lvl'])] = \
                    champ_roster_stat_html.text[2:]
            except AttributeError:
                champion_stats[''.join([stat, '/lvl'])] = '0'

        # Find the mana type, location of "Secondary Bar:" text
        champions_resource_html = champions_html.find(
            style="font-size:10px; line-height:1em; display:block; "
            "color:rgb(147, 115, 65); margin-top:3px; margin-bottom:0;")

        # Try and get the direct path of the bar
        try:
            champ_resource = champions_resource_html.next_sibling.\
                next_element.contents[2].text
        except IndexError:
            champ_resource = "Manaless"

        # Add stat to stat array
        champion_stats['ResourceType'] = champ_resource

        # Write champs with stats into array
        # NOTE(review): key uses '%27' -> '-' while other functions use
        # '%27' -> "'" — confirm downstream consumers expect this format
        my_globals.champion_info[champ[6:].replace("%27",
                                                   "-")] = champion_stats
        my_tools.log_status(champ[6:])
    finally:
        # Signal thread is complete even on failure, so the spawner loop in
        # get_champ() never waits forever for a slot
        with my_globals.counter_lock:
            my_globals.thread_count -= 1
def get_item(home_directory):
    """
    Return all item information from all maps

    Walks the wiki's item grid section by section, spawning one worker
    thread per item (bounded by my_globals.thread_max); workers populate
    my_globals.item_info.

    :param home_directory: Project root; 'HTML Pages' lives beneath it
    :return: item information (via my_globals.item_info)
    """
    # Sections not used by the calculator — skipped in both passes below
    skipped_categories = {'Potions and Consumables', 'Distributed',
                          'Removed items', 'Trinkets'}

    # Log current status of program
    my_tools.log_status('Getting Item Grid')

    # Change directory to HTML pages
    os.chdir(''.join([home_directory, '/HTML Pages']))

    # Create urllib3 pool to download each web page
    http_pool = urllib3.PoolManager()
    main_url = my_tools.get_web_page(page_name='Item',
                                     path='/Items',
                                     http_pool=http_pool)

    # For formatting
    my_tools.log_status('\n')

    # Use the item page and set up parsing
    with my_globals.bs4_lock:
        item_grid_html = BeautifulSoup(markup=main_url, features='lxml')
        # Find the item grid and start to parse
        finished_items_html = item_grid_html.find(id='item-grid')

    # Loop through item grid for each item section; the grid alternates
    # header nodes (cnt % 4 == 1) and item-list nodes (cnt % 4 == 3)
    for cnt, null in enumerate(finished_items_html.contents):
        # Add section to dictionary
        if cnt % 4 == 1:
            # Save current section being worked on
            category = finished_items_html.contents[cnt].text.strip()
            # Skip sections not used by calculator
            if category in skipped_categories:
                continue
            # Log status of program
            my_tools.log_status(''.join(['Starting Section: ', category]))
            # Create entry for current section in global dictionary
            my_globals.item_info[category] = {}

        # Search though section for items
        if cnt % 4 == 3:
            # Save current section being worked on (its header is two
            # nodes back)
            category = finished_items_html.contents[cnt - 2].text.strip()
            # Skip sections not used by calculator
            if category in skipped_categories:
                continue

            # Array to hold threads
            all_item_threads = []

            # Get the page for each item in the category and start to parse
            for item in finished_items_html.contents[cnt]:
                # Save item path and readable names
                item_name = item.contents[0].contents[0].contents[0].get(
                    'href')
                current_item_name = item_name[6:].replace(
                    '%27', '\'').replace('_', ' ')

                # Create thread for each item being parsed
                while True:
                    # Only create a thread if limit has not been exceeded
                    if my_globals.thread_count < my_globals.thread_max:
                        # Signal a new thread is being created
                        with my_globals.counter_lock:
                            my_globals.thread_count += 1
                        # Create thread and process each item
                        thread = threading.Thread(target=get_item_page,
                                                  args=(item, cnt,
                                                        finished_items_html,
                                                        category, http_pool),
                                                  name=current_item_name)
                        # Append current thread to list and start thread
                        all_item_threads.append(thread)
                        thread.start()
                        # Exit loop once processing is done
                        break
                    # Wait if a thread queue is full
                    time.sleep(2)

            # Wait for all threads to finish processing
            for thread in all_item_threads:
                thread.join()
            # For formatting
            my_tools.log_status('\n')
    return