import json
from io import BytesIO
from urllib.request import urlopen

from beautifulscraper import BeautifulScraper
from PIL import Image, ImageTk


def show_image(canvas, images, card_name):
    search = 'yugioh+' + card_name.replace(' ', '+')
    scraper = BeautifulScraper()
    scraper.add_header(
        'User-Agent',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/43.0.2357.134 Safari/537.36'
    )
    url = 'https://www.google.co.in/search?q=' + search + '&source=lnms&tbm=isch'
    page = scraper.go(url)
    # Google embeds image metadata as JSON in div.rg_meta; 'ou' holds the original image URL
    image_url = json.loads(page.find_all('div', {'class': 'rg_meta'})[0].text)['ou']
    print(image_url)
    image_bytes = urlopen(image_url).read()
    pil_image = Image.open(BytesIO(image_bytes))
    # Scale to a height of 300px while preserving the aspect ratio
    w, h = pil_image.size
    ratio = 300 / h
    # Note: Image.ANTIALIAS was removed in Pillow 10; use Image.LANCZOS there
    pil_image = pil_image.resize((int(w * ratio), int(h * ratio)), Image.ANTIALIAS)
    images[0] = ImageTk.PhotoImage(pil_image)
    # create_image(xpos, ypos, image, anchor)
    canvas.create_image(150, 150, image=images[0], anchor='center')
    canvas.pack(expand='yes', side='top')
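# A minimal sketch of driving show_image from a Tkinter window. The card name
# and the 300x300 canvas size are illustrative assumptions, and the lookup
# only works while Google's rg_meta markup matches the scraping above.
import tkinter as tk

root = tk.Tk()
canvas = tk.Canvas(root, width=300, height=300)
images = [None]  # keeps a reference so the PhotoImage is not garbage-collected
show_image(canvas, images, 'Dark Magician')
root.mainloop()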
import time

import boto3
from beautifulscraper import BeautifulScraper


def get_actors(game_name, imdb_url):
    # Get the DynamoDB service resource and record the game title.
    new_url = resolve_url(imdb_url)  # resolve_url is a helper defined elsewhere
    print(new_url)
    dynamodb = boto3.resource('dynamodb')
    table = dynamodb.Table('game_titles')
    response = table.put_item(Item={'game_name': game_name})
    table = dynamodb.Table('all_games')

    # Initialize the scraper and fetch the cast table.
    scraper = BeautifulScraper()
    soup = scraper.go(str(imdb_url))
    time.sleep(10)
    cast = soup.select("table.cast_list")[0]
    rows = cast.find_all('tr')
    del rows[0]  # drop the header row

    # Iterate through the remaining rows.
    for tr in rows:
        cols = tr.find_all('td')
        if len(cols) < 4:
            continue
        actor_name = ' '.join(cols[1].find('a').text.split())
        character_name = cols[3].find('div').text.split()
        if '(voice)' in character_name:
            character_name.remove('(voice)')
        if '(uncredited)' in character_name:
            character_name.remove('(uncredited)')
        character_name = ' '.join(character_name)
        if character_name == '':
            character_name = 'Unknown'
        if actor_name == '':
            actor_name = 'Unknown'

        # Write to DynamoDB.
        try:
            print(game_name, actor_name, character_name)
            response = table.put_item(Item={
                'game_name': game_name,
                'actor_name': actor_name,
                'character_name': character_name,
            })
        except Exception:
            print(tr)
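# The function above assumes a resolve_url helper defined elsewhere in the
# project. A minimal hypothetical stand-in that just follows redirects and
# returns the final URL:
import requests

def resolve_url(url):
    # Follow any redirects and report the URL we actually land on
    return requests.get(url, allow_redirects=True).url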
import re
import time

from beautifulscraper import BeautifulScraper


def maclookup(macaddress):
    # Parse the MAC to get rid of special characters,
    # leaving only the alphanumeric digits.
    mac = ''.join(re.findall(r'[\d\w]*', macaddress))
    # Sleep one second so we do not violate the terms of the API
    # (one request per second, 1000 per day).
    time.sleep(1)
    # Look up the vendor for this MAC.
    url = 'https://api.macvendors.com/' + mac
    scraper = BeautifulScraper()
    body = scraper.go(url)
    results = str(body)
    return results
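# Example call; the MAC address below is an arbitrary illustrative value,
# and the separators show why the special-character stripping is needed.
vendor = maclookup('44:38:39:ff:ef:57')
print(vendor)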
import urllib.request

from beautifulscraper import BeautifulScraper


def collect_emoji(storage):
    '''
    This function collects emoji images from webfx.com/tools/emoji-cheat-sheet.
    :param storage: location for emojis to be stored
    :return: 1 if successful, 0 otherwise
    '''
    bs = BeautifulScraper()
    url = "https://www.webfx.com/tools/emoji-cheat-sheet/"
    page = bs.go(url)
    if not page:
        return 0
    print("Beginning Scrape.")
    for emoji in page.find_all("span", {"class": "emoji"}):
        image_path = emoji['data-src']
        # data-src is a relative path; keep only the file name
        file_name = image_path.split("/")[-1]
        urllib.request.urlretrieve(url + image_path, storage + file_name)
    return 1
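# Hypothetical usage: create a local "emojis" directory (the name is an
# assumption) and download everything into it.
import os

os.makedirs('emojis', exist_ok=True)
collect_emoji('emojis/')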
from time import sleep

import requests
from beautifulscraper import BeautifulScraper


def get_emojis(url, output_path):
    """
    Name: get_emojis
    Description: Navigates to the url and scrapes the page for all emojis
    Params:
        url - the url hosting all the emoji images
        output_path - the location to save each emoji image
    Returns: None
    """
    # Load scraper data
    bs = BeautifulScraper()
    soup = bs.go(url)
    emojis = soup.find_all("span", {"class": "emoji"})

    # To calculate completion %
    total = len(emojis)
    count = 1
    for emoji in emojis:
        # Get each image's data
        image_path = emoji['data-src']
        file_name = get_file_name(image_path)  # helper defined elsewhere

        # Check the image URL for validity
        r = requests.get(url + image_path, allow_redirects=True)
        if r.status_code == 200:
            with open(output_path + file_name, 'wb') as f:
                for chunk in r:
                    f.write(chunk)
            print('{0}/{1}: Success \t{2:.2f}%'.format(
                count, total, (count / total) * 100))
        else:
            print('{0}/{1}: Failure \t{2:.2f}%'.format(
                count, total, (count / total) * 100))
        count += 1
        sleep(0.02)
    # count ends one past the last emoji, hence the -1
    print('{0} emojis saved to {1}'.format(count - 1, output_path))
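# get_file_name is a helper defined elsewhere in the original project; a
# minimal stand-in that keeps everything after the last slash could be:
def get_file_name(image_path):
    # e.g. "images/emoji/smile.png" -> "smile.png"
    return image_path.rsplit('/', 1)[-1]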
import re

from beautifulscraper import BeautifulScraper


def urban_dict(term):
    scraper = BeautifulScraper()
    url = "https://www.urbandictionary.com/define.php?term=%s" % term
    page = scraper.go(url)
    def_div = page.find("div", {'class': 'meaning'})
    definition = ""
    # The loop strips HTML tags and removes carriage returns.
    if def_div is not None:
        for container in def_div:
            t = str(container)
            t = re.sub('<.*?>', ' ', t)
            t = t.rstrip()
            definition += t
        print("\n" + re.sub(r'\+', ' ', term) + ": \n" + definition + "\n")
    else:
        print("term not found (check case)")
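# Example call; multi-word terms use '+' in place of spaces, matching the
# define.php query format the function builds.
urban_dict('lorem+ipsum')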
import re

from beautifulscraper import BeautifulScraper


def getUrls(self, tag=None):
    scraper = BeautifulScraper()
    site = "https://salttiger.com/" + ("" if tag is None else "tag/%s/" % tag.lower())
    body = scraper.go(site)
    articles = body.select('article')
    for article in articles:
        self.parse_meta_info(article)

    # Read the total page count from the pagination widget,
    # then walk the remaining pages.
    totalPages = body.select('div.wp-pagenavi span.pages')[0].text
    pattern = re.compile(r'(\d+)')
    counts = int(re.findall(pattern, totalPages)[-1])
    for i in range(2, counts + 1):
        url = site + ("page/%d/" % i)
        body = scraper.go(url)
        articles = body.select('article')
        for article in articles:
            self.parse_meta_info(article)
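# getUrls is written as a method, so it assumes a surrounding class with a
# parse_meta_info method. A hypothetical skeleton showing how it might be
# attached and called (the class name and printing behavior are assumptions):
class SaltTigerScraper:
    def parse_meta_info(self, article):
        # Print each article's title link, if present
        link = article.select_one('h2 a')
        if link is not None:
            print(link.get_text(strip=True), link['href'])

SaltTigerScraper.getUrls = getUrls  # attach the function above as a method
SaltTigerScraper().getUrls(tag='python')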
from beautifulscraper import BeautifulScraper

scraper = BeautifulScraper()
url = "https://ifunny.co"
page = scraper.go(url)

# Find all of the links to each category (other pages);
# they happen to be in list item (li) tags.
for li in page.find_all("li", {"class": "categories-list__item"}):
    print(li)
    print(li.a)
    print(li.a['href'])
    print("%s%s" % (url, li.a['href']))
    sub_url = url + li.a['href']
    sub_page = scraper.go(sub_url)
    for sli in sub_page.find_all("li", {"class": "categories-list__item"}):
        print(sli.a['href'])
import os
import sys
import json
from time import sleep
from pprint import pprint

from beautifulscraper import BeautifulScraper

# Get the start and end year from the terminal
if len(sys.argv) != 3:
    print('Enter start and end year to run script. '
          'Ex. "python3 nfl_scrape_gameids.py 2009 2019"')
else:
    # Load scraper data and URLs
    bs = BeautifulScraper()
    schedule_url = 'http://www.nfl.com/schedules/'
    game_url = 'http://www.nfl.com/liveupdate/game-center/'

    startYear = int(sys.argv[1])
    endYear = int(sys.argv[2])
    years = [x for x in range(startYear, endYear)]
    preWeeks = [x for x in range(1, 5)]   # to include HOF, make range(0, 5)
    regWeeks = [x for x in range(1, 18)]
    # postWeeks: nothing, due to the web structure -- all post-season weeks are on one page
    gameids = {
        'PRE': {},
        'REG': {},
        'POST': {},
    }
"""
Name: Jakob Lopez
Description:
    NFL provides JSON data that contains detailed game information for
    every single game played. This program collects the JSON item for each
    game from 2009 to 2018 and places them in a folder. Using knowledge of
    the NFL website URL structure, the game id for every game is collected
    and used to access the JSON items.
"""
from pprint import pprint
import json
import urllib
import requests
import sys

from beautifulscraper import BeautifulScraper

scraper = BeautifulScraper()
# Years 2009 to 2018
years = list(range(2009, 2019))
# Weeks 1 to 17
weeks = list(range(1, 18))
# Dictionary of REG & POST keys that have list values
gameIDs = {'REG': [], 'POST': []}
# Open a file for writing
f = open("gameIDs.json", "w")

"""
Name: scrape_data
Description:
# -*- coding: utf-8 -*-
__author__ = 'Hu Wenchao'

import re

from beautifulscraper import BeautifulScraper

PROBLEM_NUMBER = 480

scraper = BeautifulScraper()

# Fetch the problem information
# for number in range(1, PROBLEM_NUMBER + 1):
number = 99
url = "https://projecteuler.net/problem=%d" % number
soup = scraper.go(url)
title = soup.h2.get_text()  # get the problem title
# problem_description = soup.find(role="problem").get_text()  # get the problem description
# print(problem_description)
"""
Name: Buddy Smith
Description:
    Using beautifulscraper, football game ids are scraped from NFL.com.
    The ids are stored in the dict gameids.
"""
from pprint import pprint
import os
import json
from time import sleep

from beautifulscraper import BeautifulScraper

beauty = BeautifulScraper()
url = "http://www.nfl.com/schedules/"
years = list(range(2009, 2019))
weeks = list(range(1, 18))       # 17 weeks in a season
preWeeks = list(range(1, 5))
postWeeks = list(range(1, 2))
gameids = {'PRE': {}, 'REG': {}, 'POST': {}}

for year in years:
    gameids["PRE"][year] = {}
    for preWeek in preWeeks:
        gameids['PRE'][year][preWeek] = []
        newURL = url + "%d/PRE%d" % (year, preWeek)  # create the new URL
        page = beauty.go(newURL)                     # go to the new URL
        # collect the schedule contents
        contents = page.find_all('div', {'class': 'schedules-list-content'})
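        # Hypothetical continuation of the loop above: on the old NFL.com
        # markup each schedules-list-content div carried the game id in a
        # data-gameid attribute, so the collection step plausibly looked like:
        for content in contents:
            game_id = content.get('data-gameid')
            if game_id:
                gameids['PRE'][year][preWeek].append(game_id)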
#!/usr/bin/env python3
from beautifulscraper import BeautifulScraper

scraper = BeautifulScraper()
body = scraper.go("https://github.com/adregner/beautifulscraper")
body.select(".repository-meta-content")[0].text
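# go() returns a BeautifulSoup document, so the usual BeautifulSoup
# navigation works directly on the result; a couple of illustrative calls:
print(body.title.text)
for a in body.find_all('a')[:5]:
    print(a.get('href'))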
from beautifulscraper import BeautifulScraper
from time import sleep
import sys
import json
from pprint import pprint
import urllib

scraper = BeautifulScraper()


def get_category_links(data):
    # Collect the absolute URL of every category link on the page
    categories = []
    for li in data.find_all("li", {"class": "categories-list__item"}):
        categories.append(url + li.a['href'])
    return categories


# If the file is called directly, run this block
if __name__ == '__main__':
    url = 'https://ifunny.co'
    with open('meme_links.json') as f:
        data = json.load(f)
    pprint(data)
    page_nums = [x for x in range(5)]
    for num in page_nums:
        num += 1
import re

from beautifulscraper import BeautifulScraper

scraper = BeautifulScraper()
body = scraper.go("http://example.webscraping.com/view/Brazil-3")

# body is a BeautifulSoup document, not a string, so render it back to
# HTML before applying the regular expression.
first = re.findall(r'<td class="w2p_fw">(.*?)</td>', str(body))
print(first)
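# Since body is already a parsed tree, the same cells can also be pulled
# with a CSS selector instead of a regex, which is usually more robust:
values = [td.get_text() for td in body.select('td.w2p_fw')]
print(values)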
"""
Github username: dcortez0817
Repo url: https://github.com/dcortez0817/4883-SWTools-Cortez
Name: Darien Cortez
Description:
    This program scrapes the game ids from NFL.com using beautifulscraper.
    It places the game ids in a JSON file so that the individual stats for
    every game from 2009 to 2019 can then be scraped into a JSON file.
"""
from pprint import pprint  # pretty print
import urllib              # fetches URL contents
import json                # JavaScript Object Notation support

from beautifulscraper import BeautifulScraper

scraper = BeautifulScraper()       # scraper object
f = open("g_IDs.json", 'w')        # file to hold the game ids
gameids = {'REG': [], 'POST': []}

"""
season_point(season, year, week="None"):
    This function checks which point of the season users are in, puts that
    information in the url, scrapes the game ids for that season, places
    the data in a file, and prints the completion of that task.
    Params:
        season [string] : the point you are at in the season
        year [int]      : the year you are in
        week [int]      : equal to None because the weeks are only needed
dest="addrs", default=url1) parser.add_option("-t", "--arch", type="string", help=msg2, dest="arch", default="amd64") options, arguments = parser.parse_args() ''' The following block of code fetches information from the HTML from the base URL. ''' __data__ = {} if (options.addrs): scraper = BeautifulScraper() body = scraper.go(options.addrs) __data__[options.addrs] = {} __bucket__ = __data__[options.addrs] anchors = body.find_all('a') for anchor in anchors: href = anchor.attrs.get('href', '') if (len(href) > 0) and (href not in ['../']): __is__ = href.find('.') and (len(href.split('.')) == 2) __matches__ = (not __is__) and (href.find(options.arch) > -1) if (__matches__): __bucket__[href] = {'href': href, 'is_filename': __is__} '''
"""
Assignment: A03
Date: 2/06/19
Github username: jeremyglebe
Repo url: https://github.com/jeremyglebe/4883-SWTools-Glebe
Name: Jeremy Glebe
Description:
    Scrapes NFL game ids so that those ids can later be used to scrape data.
"""
import json
from time import sleep
from random import random as rnd

from beautifulscraper import BeautifulScraper

# Scraper object
scraper = BeautifulScraper()
# Year and week ranges
years = [x for x in range(2009, 2019)]
weeks = [x for x in range(1, 19)]
# Object to store game ids
game_ids = {}

# For each year we are getting data from
for year in years:
    # Initialize the year
    game_ids[year] = {}
    # Week by week
    for week in weeks:
        # Initialize the list of games for that week
        game_ids[year][week] = []
        # Get the correct url for the week
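        # Hypothetical continuation of the inner loop, following the schedule
        # URL pattern used by the other scrapers in this collection
        # (http://www.nfl.com/schedules/<year>/REG<week>):
        page_url = "http://www.nfl.com/schedules/%d/REG%d" % (year, week)
        page = scraper.go(page_url)
        sleep(rnd())  # randomized pause so requests are not fired back-to-back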
""" Course: CMPS 4883 Assignemt: A06 Date: 3/09/19 Github username: dcortez0817 Repo url: https://github.com/dcortez0817/4883-SWTools-Cortez Name: Darien Cortez Description: This program scrapes 877 emojis from the https://www.webfx.com/tools/emoji-cheat-sheet/ website and stores the images in the emojis folder. """ from beautifulscraper import BeautifulScraper import urllib scraper = BeautifulScraper() #variable for scraping data url = 'https://www.webfx.com/tools/emoji-cheat-sheet/' # Use beatiful soup to read the page page = scraper.go(url) #used to count emojis cnt = 0 # then loop through the page with the following for emoji in page.find_all("span",{"class":"emoji"}): image_path = emoji['data-src'].split("/") # save the image using requests library urllib.request.urlretrieve(url+emoji["data-src"], 'emojis/'+image_path[-1]) cnt+=1
from time import sleep
from random import shuffle
import os

from beautifulscraper import BeautifulScraper

"""
Course: CMPS 4883
Assignment: A03
Date: 2/10/19
Github username: acdczlc
Repo url: https://github.com/acdczlc/4883-SWTools-Conley
Name: Zac Conley
Description:
    Scrapes NFL game ids from the web.
"""
sleeper = .01                          # sleep timer to prevent over-requesting the server
scraper = BeautifulScraper()           # initialize scraper
sch = "http://www.nfl.com/schedules/"  # url of schedules
firstyear = 2009                       # first year to search
lastyear = 2019                        # last year to search (exclusive)
preseason = [x for x in range(0, 5)]
regseason = [x for x in range(1, 18)]
years = [x for x in range(firstyear, lastyear)]  # ranges for years and weeks of the season

print("Fetching all gameids from " + str(firstyear) + "-" + str(firstyear + 1) +
      " to " + str(lastyear - 1) + "-" + str(lastyear))
print("This will take several minutes, please be patient.")  # user message
print("This program will let you know when it is done.")

gameids = {  # 3 types in a season
    "preseason": {},
    "regular_season": {},
    "playoffs": {},
}
# -*- coding: utf-8 -*-
#
"""
This file populates the database using the HU scraper. It connects to the
HU website and imports some items into a mongodb database.
"""
from beautifulscraper import BeautifulScraper
import pymongo
import os
import time
import ConfigParser  # Python 2 module; configparser on Python 3

config = ConfigParser.RawConfigParser()
config.read(os.path.join(os.environ['ROOT_DIR'], 'populate', 'populate.cfg'))

scraper = BeautifulScraper()

# Set constants from the config file and environment
HOTEL_FIELDS = config.options('hotel_fields')
MAXCONNECTIONS = config.getint('default', 'maxconnections')
URL_SCRAP = config.get('default', 'url_scrap')
MAXPAGES = config.getint('default', 'maxpages') + 1
MONGO_HOST = os.environ['MONGO_HOST']
MONGO_PORT = os.environ['MONGO_PORT']
MONGO_DBNAME = os.environ['MONGO_DBNAME']
MONGO_USERNAME = os.environ['MONGO_USERNAME']
MONGO_PASSWORD = os.environ['MONGO_PASSWORD']

# Wait for mongodb to come up before connecting
for i in xrange(MAXCONNECTIONS):
    try:
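        # Hypothetical completion of the truncated retry loop: ping mongod,
        # retrying once per second until it answers (standard pymongo calls).
        client = pymongo.MongoClient(MONGO_HOST, int(MONGO_PORT))
        client.admin.command('ping')
        break
    except pymongo.errors.ConnectionFailure:
        time.sleep(1)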