def show_image(canvas, images, card_name):

    search = 'yugioh+' + card_name.replace(' ', '+')

    scraper = BeautifulScraper()
    scraper.add_header(
        'User-Agent',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.134 Safari/537.36'
    )
    url = 'https://www.google.co.in/search?q=' + search + '&source=lnms&tbm=isch'
    page = scraper.go(url)

    image_url = json.loads(page.find_all('div',
                                         {'class': 'rg_meta'})[0].text)['ou']
    print(image_url)
    image_bytes = urlopen(image_url).read()
    pil_image = Image.open(BytesIO(image_bytes))
    w, h = pil_image.size
    ratio = 300 / h
    pil_image = pil_image.resize((int(w * ratio), int(h * ratio)),
                                 Image.LANCZOS)  # high-quality downsampling filter
    images[0] = ImageTk.PhotoImage(pil_image)
    # create_image(xpos, ypos, image, anchor)
    canvas.create_image(150, 150, image=images[0], anchor='center')
    canvas.pack(expand='yes', side='top')
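A minimal, hypothetical driver for show_image, assuming the imports the snippet relies on (tkinter, PIL, BeautifulScraper, json, BytesIO, urlopen) are in place and using an arbitrary card name. Note that Google has changed its image-results markup since this was written, so the 'rg_meta' lookup above may no longer return anything.

import tkinter as tk

if __name__ == '__main__':
    root = tk.Tk()
    root.title('Card preview')
    canvas = tk.Canvas(root, width=300, height=300)
    images = [None]  # the list keeps a reference so the PhotoImage is not garbage-collected
    show_image(canvas, images, 'Dark Magician')
    root.mainloop()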
Example #2
def get_actors(game_name, imdb_url):
    # Get the service resource.
    new_url = resolve_url(imdb_url)
    print(new_url)
    dynamodb = boto3.resource('dynamodb')
    table = dynamodb.Table('game_titles')
    response = table.put_item(Item={
        'game_name': game_name,
    })

    table = dynamodb.Table('all_games')
    print(new_url)
    # Initialize scraper
    scraper = BeautifulScraper()
    soup = scraper.go(str(imdb_url))
    time.sleep(10)
    cast = soup.select("table.cast_list")[0]
    rows = cast.find_all('tr')
    del rows[0]

    # Iterate through rows
    for tr in rows:
        cols = tr.findAll('td')
        put_item_flag = True
        if len(cols) < 4:
            continue  # not a full cast row; skip it so stale names are not re-written
        else:
            actor_name = ' '.join(cols[1].find('a').text.split())
            character_name = cols[3].find('div').text.split()
            if '(voice)' in character_name:
                character_name.remove('(voice)')
            if '(uncredited)' in character_name:
                character_name.remove('(uncredited)')
            character_name = ' '.join(character_name)
            if character_name == '':
                character_name = 'Unknown'
            if actor_name == '':
                actor_name = 'Unknown'

        # Write to Dynamo
        try:
            print(game_name, actor_name, character_name)
            response = table.put_item(
                Item={
                    'game_name': game_name,
                    'actor_name': actor_name,
                    'character_name': character_name,
                })
        except Exception:
            print(tr)
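Example #2 calls a resolve_url() helper that is not included in the snippet. A minimal sketch of what it might do, assuming it simply follows redirects to the final IMDb address (this implementation is an assumption, not the original helper):

import requests

def resolve_url(imdb_url):
    # Hypothetical stand-in for the missing helper: follow redirects and
    # return the final URL. The original implementation may differ.
    response = requests.get(imdb_url, allow_redirects=True, timeout=10)
    return response.url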
Example #3
def maclookup(macaddress):
    # Now parsing the MAC to get rid of special characters
    macaddress = re.findall(r'[\d\w]*', macaddress)
    mac = ''.join(macaddress)
    # Now putting in a one second timeout so that we do not violate the terms
    # Of the API object (one per second, 1000 per day)
    time.sleep(1)
    # Now getting the mac
    url = 'https://api.macvendors.com/' + mac
    scraper = BeautifulScraper()
    body = scraper.go(url)
    results = str(body)
    return results
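A quick usage sketch for maclookup (the address below is arbitrary; api.macvendors.com returns an error body rather than a vendor name for unknown prefixes):

if __name__ == '__main__':
    # Arbitrary example MAC address; replace with the address to look up.
    print(maclookup('44:38:39:ff:ef:57'))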
Example #4
def collect_emoji(storage):
    '''
    This function collects the emoji images from webfx.com/tools/emoji-cheat-sheet.
    :param storage: location for emojis to be stored
    :return: 1 if successful, 0 otherwise
    '''
    bs = BeautifulScraper()
    url = "https://www.webfx.com/tools/emoji-cheat-sheet/"

    page = bs.go(url)
    if page:
        count = 0
        print("Beginning Scrape.")
        for emoji in page.find_all("span", {"class": "emoji"}):
            image_path = emoji['data-src']
            file_name = image_path.split("/")[-1]
            urllib.request.urlretrieve(url + image_path, storage + file_name)
            count += 1
        print("Saved {} emojis.".format(count))
        return 1
    return 0
Example #5
def get_emojis(url, output_path):
    """
    Name:
        get_emojis
    Description:
        Navigates to the url and scrapes the page for all emojis
    Params:
        url - the url hosting all the emoji images
        output_path - the location to save each emoji image
    Returns:
        None
    """

    # Load scraper data
    bs = BeautifulScraper()
    soup = bs.go(url)
    emojis = soup.find_all("span", {"class": "emoji"})

    # To calculate completion %
    total = len(emojis)
    count = 1

    for emoji in emojis:
        # Get each images data
        image_path = emoji['data-src']
        file_name = get_file_name(image_path)

        # Check image url for validity
        r = requests.get(url + image_path, allow_redirects=True)

        # If valid
        if r.status_code == 200:
            with open(output_path + file_name, 'wb') as f:
                for chunk in r:
                    f.write(chunk)
            print('{0}/{1}: Success  \t{2:.2f}%'.format(
                count, len(emojis), (count / total) * 100))
        # If invalid
        else:
            print('{0}/{1}: Failure  \t{2:.2f}%'.format(
                count, len(emojis), (count / total) * 100))

        count += 1
        sleep(0.02)

    print('{0} emojis processed and written to {1}'.format(total, output_path))
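get_emojis depends on a get_file_name() helper that the snippet does not show. A plausible sketch, assuming it simply returns the last path component (the real helper may differ):

def get_file_name(image_path):
    # Hypothetical helper: "graphics/emojis/smile.png" -> "smile.png"
    return image_path.rsplit('/', 1)[-1]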
Example #6
def urban_dict(term):
    scraper = BeautifulScraper()

    url = "https://www.urbandictionary.com/define.php?term=%s" % (term)

    page = scraper.go(url)
    def_div = page.find("div", {'class': 'meaning'})

    definition = ""
    #for loop strips html tags, also removes carriage return
    if (def_div is not None):
        for container in def_div:
            t = str(container)
            t = re.sub('<.*?>', ' ', t)
            t = t.rstrip()
            definition += t

        print("\n" + re.sub('\+', ' ', term) + ": \n" + definition + "\n")
    else:
        print("term not found(check case)")
Example #7
    def getUrls(self, tag = None):
        scraper = BeautifulScraper()
        site = "https://salttiger.com/" + ("" if tag == None else "tag/%s/" % tag.lower())
        body = scraper.go(site)  
        
        articles = body.select('article')   
        for article in articles:
            self.parse_meta_info(article)

        totalPages = body.select('div.wp-pagenavi span.pages')[0].text
        pattern = re.compile(r'(\d+)')
        counts = int(re.findall(pattern,totalPages)[-1])  

        for i in range(2,counts + 1):

            url = site + ("page/%d/" % i)
            body = scraper.go(url)
            articles = body.select('article')

            for article in articles:
                self.parse_meta_info(article)
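getUrls hands each article off to parse_meta_info, which is not shown. A heavily hedged sketch, assuming each <article> exposes its title and permalink in a heading anchor (the selectors are assumptions and may not match salttiger.com's current markup):

    def parse_meta_info(self, article):
        # Hypothetical sketch of the missing method: pull the title and link
        # out of the article's heading anchor, if present.
        heading = article.find(['h1', 'h2'])
        link = heading.find('a') if heading is not None else None
        if link is not None:
            print(link.get_text(strip=True), link.get('href'))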
Example #8
from beautifulscraper import BeautifulScraper
scraper = BeautifulScraper()

url = "https://ifunny.co"
page = scraper.go(url)

# find all of the links to each category (other pages)
# they happen to be in line item (li) tags
for li in page.find_all("li", {"class": "categories-list__item"}):
    print(li)
    print(li.a)
    print(li.a['href'])
    print("%s%s" % (url, li.a['href']))
    sub_url = url + li.a['href']
    sub_page = scraper.go(sub_url)

    for sli in sub_page.find_all("li", {"class": "categories-list__item"}):
        print(sli.a['href'])
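The loop above only prints the category links. A natural follow-up, and roughly what Example #14 below appears to consume, is to dump them to a JSON file; the list-of-URLs format is an assumption, since Example #14 only shows the file being loaded:

import json
from beautifulscraper import BeautifulScraper

scraper = BeautifulScraper()
url = "https://ifunny.co"
page = scraper.go(url)

# Collect the category links and write them out as JSON (format assumed).
category_links = [url + li.a['href']
                  for li in page.find_all("li", {"class": "categories-list__item"})]
with open('meme_links.json', 'w') as f:
    json.dump(category_links, f, indent=2)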
import os
import sys
import json
from beautifulscraper import BeautifulScraper
from time import sleep
from pprint import pprint

# Get input file from terminal
if len(sys.argv) != 3:

    print('Enter start and end year to run script. Ex. "python3 nfl_scrape_gameids.py 2009 2019"')

else:
    #load scraper data and urls
    bs = BeautifulScraper()

    schedule_url = 'http://www.nfl.com/schedules/'
    game_url = 'http://www.nfl.com/liveupdate/game-center/'

    startYear = int(sys.argv[1])
    endYear = int(sys.argv[2])

    years = [x for x in range(startYear, endYear)]
    preWeeks = [x for x in range(1, 5)]  # to include HOF, make (0,5)
    regWeeks = [x for x in range(1, 18)]
    #postWeeks = Nothing due to web structure. All weeks on one page.

    gameids = {
        'PRE': {},
        'REG': {},
        'POST': {}
Example #10
Name: Jakob Lopez
Description: 
    NFL provides JSON data that contains detailed game information
    for every single game played. This program collects every JSON
    item for each game from 2009 to 2018 and places them in a folder.
    Using knowledge of the NFL website's URL scheme, the game-id for every
    game is collected and used to access the JSON items.
"""
from beautifulscraper import BeautifulScraper
from pprint import pprint
import json
import urllib
import requests
import sys

scraper = BeautifulScraper()

#Years 2009 to 2018
years = list(range(2009, 2019))

#Week 1 to 17
weeks = list(range(1, 18))

#Dictionary of REG & POST keys that have list values
gameIDs = {'REG': [], 'POST': []}

#Opens a file for writing
f = open("gameIDs.json", "w")
"""
Name: scrape_data
Description: 
# -*- coding: utf-8 -*-
__author__ = 'Hu Wenchao'

from beautifulscraper import BeautifulScraper
import re
PROBLEM_NUMBER = 480
scraper = BeautifulScraper()

# Fetch the problem information
#for number in xrange(1, PROBLEM_NUMBER+1):
number = 99
url = "https://projecteuler.net/problem=%d" % number
soup = scraper.go(url)
title = soup.h2.get_text()  # get the problem title
# problem_description = soup.find(role="problem").get_text()
# print(problem_description)  # print the problem information
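The commented-out loop above hints at iterating over every problem. A sketch of that fuller loop, assuming the titles are simply written to a text file (the output file name and format are assumptions):

with open('problem_titles.txt', 'w', encoding='utf-8') as out:
    for number in range(1, PROBLEM_NUMBER + 1):
        soup = scraper.go("https://projecteuler.net/problem=%d" % number)
        out.write("%d\t%s\n" % (number, soup.h2.get_text()))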

Example #12
Name: Buddy Smith
Description: 
    Using beautifulscraper, football game ids are scraped from NFL.com. The IDs are stored
    in the dict gameids.

"""

from beautifulscraper import BeautifulScraper
from pprint import pprint

import os
import json
from time import sleep


beauty = BeautifulScraper()
url = "http://www.nfl.com/schedules/"
years = list(range(2009, 2019))
weeks = list(range(1, 18)) # 17 weeks in season
preWeeks = list(range(1, 5))
postWeeks = list(range(1,2))
gameids = {'PRE':{}, 'REG':{},'POST':{}}


for year in years:
    gameids["PRE"][year] = {}
    for preWeek in preWeeks:
        gameids['PRE'][year][preWeek] = []
        newURL = url + "%d/PRE%d" % (year, preWeek)  # create new URL
        page = beauty.go(newURL)  # go to new url
        contents = page.find_all('div', {'class': 'schedules-list-content'})  # collect contents
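        # Hypothetical continuation (the snippet is cut off here): pull the
        # game id off each schedule entry. The 'data-gameid' attribute name is
        # an assumption based on the old NFL.com markup these scripts targeted;
        # the live site has since changed.
        for content in contents:
            game_id = content.get('data-gameid')
            if game_id:
                gameids['PRE'][year][preWeek].append(game_id)
        sleep(0.1)  # small delay between requests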
Example #13
#!/usr/bin/env python3
from beautifulscraper import BeautifulScraper
scraper = BeautifulScraper()

body = scraper.go("https://github.com/adregner/beautifulscraper")
body.select(".repository-meta-content")[0].text
Example #14
from beautifulscraper import BeautifulScraper
from time import sleep
import sys
import json
from pprint import pprint
import urllib

scraper = BeautifulScraper()


def get_category_links(data):
    categories = []
    for li in data.find_all("li", {"class": "categories-list__item"}):
        categories.append(url + li.a['href'])

    return categories


# If file is called directly run this block
if __name__ == '__main__':
    url = 'https://ifunny.co'

    with open('meme_links.json') as f:
        data = json.load(f)

    pprint(data)

    page_nums = [x for x in range(5)]

    for num in page_nums:
        num += 1
Example #15
import urllib.request
from urllib.error import URLError, HTTPError, ContentTooShortError
#url = 'https://www.google.com/search?q=test'
from beautifulscraper import BeautifulScraper
scraper = BeautifulScraper()
import re

#url = 'http://example.webscraping.com/view/Brazil-3'
#html = (url)

body = scraper.go("http://example.webscraping.com/view/Brazil-3")
#re.findall(r'<td class="w2p_fw">(.*?)</td>',body)
first = body.find_all('td', {'class': 'w2p_fw'})
#print(body)
#print(body.title)
print(first)
Example #16
Github username: dcortez0817
Repo url: https://github.com/dcortez0817/4883-SWTools-Cortez
Name: Darien Cortez
Description: 
    This program scrapes the game ids from NFL.com using the beautiful
    scraper tool. It then places the game ids in a json file so that the
    individual game stats from 2009 to 2019 can later be scraped and
    stored in a json file as well.
"""

from beautifulscraper import BeautifulScraper
from pprint import pprint  #pretty print
import urllib  # used to retrieve data from URLs
import json  # used to read and write JSON

scraper = BeautifulScraper()  #variable for scraping data
f = open("g_IDs.json", 'w')  # file to hold the game ids

gameids = {'REG': [], 'POST': []}
"""
season_point(season, year, week = "None"):

This function determines which part of the season is being requested,
builds the corresponding URL, scrapes the game ids for that span,
writes the data to a file, and prints a message when the task
is complete.

Params: 
   season [string] : tells the point you are in the season 
   year [int] : the year you are in
   week [int] : equal to none because the weeks are only needed
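The docstring above (cut off by the listing) describes season_point. A hypothetical sketch under those assumptions; the URL template and the 'data-gameid' attribute are borrowed from Examples #12 and #18, not taken from the original, and writing gameids out to f would presumably happen once every season has been collected:

def season_point(season, year, week="None"):
    # Hypothetical sketch of the function described above; the original body
    # is not shown in the listing.
    part = season if week == "None" else "%s%d" % (season, week)
    page = scraper.go("http://www.nfl.com/schedules/%d/%s" % (year, part))
    for div in page.find_all('div', {'class': 'schedules-list-content'}):
        game_id = div.get('data-gameid')  # attribute name is an assumption
        if game_id:
            gameids[season].append(game_id)
    print("Finished scraping %s games for %d" % (season, year))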
Example #17
                      dest="addrs", default=url1)

    parser.add_option("-t", "--arch", type="string",
                      help=msg2,
                      dest="arch", default="amd64")

    options, arguments = parser.parse_args()
    
    '''
    The following block of code fetches information
    from the HTML at the base URL.
    '''

    __data__ = {}
    if (options.addrs):
        scraper = BeautifulScraper()
        body = scraper.go(options.addrs)
        
        __data__[options.addrs] = {}
        __bucket__ = __data__[options.addrs]
        
        anchors = body.find_all('a')
        for anchor in anchors:
            href = anchor.attrs.get('href', '')
            if (len(href) > 0) and (href not in ['../']):
                __is__ = ('.' in href) and (len(href.split('.')) == 2)  # looks like a filename
                __matches__ = (not __is__) and (href.find(options.arch) > -1)
                if (__matches__):
                    __bucket__[href] = {'href': href, 'is_filename': __is__}
                    
        '''
Example #18
Assignment: A03
Date: 2/06/19
Github username: jeremyglebe
Repo url: https://github.com/jeremyglebe/4883-SWTools-Glebe
Name: Jeremy Glebe
Description: 
    Scrapes NFL game ids so that those ids can later be used to scrape data
"""

import json
from beautifulscraper import BeautifulScraper
from time import sleep
from random import random as rnd

#Scraper object
scraper = BeautifulScraper()
#Year and week ranges
years = [x for x in range(2009, 2019)]
weeks = [x for x in range(1, 19)]
#Object to store game ids
game_ids = {}

#For each year we are getting data from
for year in years:
    #Initialize the year
    game_ids[year] = {}
    #Week by week
    for week in weeks:
        #Initialize the list of games for that week
        game_ids[year][week] = []
        #Get the correct url for the week
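        # Hypothetical continuation (the snippet is cut off here): build the
        # week URL the same way Example #12 does and fetch it; the exact URL
        # template is an assumption borrowed from that example.
        newURL = "http://www.nfl.com/schedules/%d/REG%d" % (year, week)
        page = scraper.go(newURL)
        sleep(1 + rnd())  # jittered delay between requests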
"""
Course: CMPS 4883
Assignment: A06
Date: 3/09/19
Github username: dcortez0817
Repo url: https://github.com/dcortez0817/4883-SWTools-Cortez
Name: Darien Cortez
Description: 
    This program scrapes 877 emojis from the https://www.webfx.com/tools/emoji-cheat-sheet/
    website and stores the images in the emojis folder.
"""
from beautifulscraper import BeautifulScraper
import urllib

scraper = BeautifulScraper() #variable for scraping data

url = 'https://www.webfx.com/tools/emoji-cheat-sheet/'

# Use the scraper to fetch the page
page = scraper.go(url)

#used to count emojis
cnt = 0

# then loop through the page with the following
for emoji in page.find_all("span",{"class":"emoji"}):
    image_path = emoji['data-src'].split("/")
    # save the image using urllib
    urllib.request.urlretrieve(url+emoji["data-src"], 'emojis/'+image_path[-1])
    cnt+=1
from beautifulscraper import BeautifulScraper
from time import sleep
from random import shuffle
import os
"""
Course: cmps 4883
Assignment: A03
Date: 2/10/19
Github username: acdczlc
Repo url: https://github.com/acdczlc/4883-SWTools-Conley
Name: Zac Conley
Description: 
  scrapes NFL game ids from the nfl.com schedule pages

"""
sleeper = .01  #set sleep timer to prevent over requesting server
scraper = BeautifulScraper()  #initialize scraper
sch = "http://www.nfl.com/schedules/"  #url of schedules
firstyear = 2009  #first year searching
lastyear = 2019  #last year to search (up to year before)
preseason = [x for x in range(0, 5)]
regseason = [x for x in range(1, 18)]
years = [x for x in range(firstyear, lastyear)]  # sets ranges for years and weeks of season
print("Fetching all gameids from " + str(firstyear) + "-" +
      str(firstyear + 1) + " to " + str(lastyear - 1) + "-" + str(lastyear))
print("This will take several minutes, please be patient.")  #user message
print("This program will let you know when it is done.")
gameids = {  #3 types in a season
    "preseason": {},
    "regular_season": {},
    "playoffs": {}
Example #21
# -*- coding: utf-8 -*- #
""" This file enables database populate using HU scrapper.
Here it connects to HU website and import some items to
mongodb database.
"""

from beautifulscraper import BeautifulScraper
import pymongo
import os
import time
import ConfigParser

config = ConfigParser.RawConfigParser()
config.read(os.path.join(os.environ['ROOT_DIR'], 'populate', 'populate.cfg'))

scraper = BeautifulScraper()

# Setting constants from config file and environment
HOTEL_FIELDS = config.options('hotel_fields')
MAXCONNECTIONS = config.getint('default', 'maxconnections')
URL_SCRAP = config.get('default', 'url_scrap')
MAXPAGES = config.getint('default', 'maxpages') + 1
MONGO_HOST = os.environ['MONGO_HOST']
MONGO_PORT = os.environ['MONGO_PORT']
MONGO_DBNAME = os.environ['MONGO_DBNAME']
MONGO_USERNAME = os.environ['MONGO_USERNAME']
MONGO_PASSWORD = os.environ['MONGO_PASSWORD']

# Wait for mongodb to come up before connecting
for i in xrange(MAXCONNECTIONS):
    try:
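        # Hypothetical continuation (the snippet is cut off here): attempt the
        # connection and retry until mongod accepts it.
        client = pymongo.MongoClient(MONGO_HOST, int(MONGO_PORT))
        client.server_info()  # force a round-trip to verify the connection
        break
    except pymongo.errors.ConnectionFailure:
        time.sleep(1)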