def getTournaments():
    """Download smash.gg tournament listing pages and print the card count.

    For each page number in ``range(1, maxPage)`` this prints the page URL,
    downloads it with ``requests``, prints the response text, writes that
    text to ``requesttest.txt`` (the file is overwritten on every page) and
    prints how many ``div`` elements carry the class
    ``TournamentCardContainer``.

    Returns:
        None. All results are printed or written to ``requesttest.txt``.
    """
    import requests
    import re
    from BeautifulSoup import BeautifulSoup

    # Example of a complete listing URL:
    # https://smash.gg/tournaments?per_page=30&filter=%7B%22name%22%3A%22%22%2C%22past%22%3Atrue%2C%22upcoming%22%3Afalse%7D&page=1
    maxPage = 2  # find a way to get this number
    # NOTE(review): this query string differs from the example URL above
    # ("fipoopC..." where the example has "filter=%7B...") and looks garbled;
    # confirm the intended filter before relying on the results.
    urlBase = "https://smash.gg/tournaments?per_page=30&fipoopC%22past%22%3Atrue%7D&page="

    # NOTE(review): range(1, maxPage) stops before maxPage, so with
    # maxPage = 2 only page 1 is requested - confirm that is intended.
    for i in range(1, maxPage):
        url = urlBase + str(i)

        # TODO: decide how a failed request (e.g. a chunked-encoding error)
        # should be reported; at the moment any exception propagates.
        print(url)
        data = requests.get(url).text
        print(data)

        # Keep a copy of the raw response for inspection.
        with open('requesttest.txt', 'w') as out_file:
            out_file.write(data)

        soup = BeautifulSoup(data)
        # BeautifulSoup 3 names its search method ``findAll``; the lowercase
        # ``soup.findall`` is not that method and the call failed.
        divs = soup.findAll("div", {"class": "TournamentCardContainer"})
        print(len(divs))
def scrape_page(url):
    """Print the ``href`` of each link inside the friends sections of a page.

    The page at ``url`` is downloaded with ``scraperwiki.scrape`` and parsed
    with BeautifulSoup. Every ``div`` whose class is ``profileFriendsText``
    is searched for ``a`` tags, and each non-empty ``href`` is printed.

    Args:
        url: Address of the page to download.

    Returns:
        None. The link targets are printed.
    """
    html = scraperwiki.scrape(url)
    soup = BeautifulSoup(html)

    # Find the sections where the friends are listed. BeautifulSoup 3 names
    # its search method ``findAll``; the lowercase ``soup.findall`` is not
    # that method and the call failed.
    profiles = soup.findAll("div", {"class": "profileFriendsText"})

    for profile in profiles:
        # The previous loop body read ``link['href']`` although no ``link``
        # was ever defined, which raised NameError on the first profile.
        # NOTE(review): assumes the wanted URLs are the anchors inside each
        # profile div - confirm against the target page's markup.
        for link in profile.findAll("a"):
            next_url = link.get("href")
            if next_url:
                print(next_url)
def scrape_page(url):
    """Print the ``href`` of each link inside the friends sections of a page.

    The page at ``url`` is downloaded with ``scraperwiki.scrape`` and parsed
    with BeautifulSoup. Every ``div`` whose class is ``profileFriendsText``
    is searched for ``a`` tags, and each non-empty ``href`` is printed.

    Args:
        url: Address of the page to download.

    Returns:
        None. The link targets are printed.
    """
    html = scraperwiki.scrape(url)
    soup = BeautifulSoup(html)

    # Find the sections where the friends are listed. BeautifulSoup 3 names
    # its search method ``findAll``; the lowercase ``soup.findall`` is not
    # that method and the call failed.
    profiles = soup.findAll("div", {"class": "profileFriendsText"})

    for profile in profiles:
        # The previous loop body read ``link["href"]`` although no ``link``
        # was ever defined, which raised NameError on the first profile.
        # NOTE(review): assumes the wanted URLs are the anchors inside each
        # profile div - confirm against the target page's markup.
        for link in profile.findAll("a"):
            next_url = link.get("href")
            if next_url:
                print(next_url)
def printLinks(url):
    """Print the links of a page twice: via a regex and via BeautifulSoup.

    The page is fetched through an ``anonBrowser`` instance after calling
    its ``anonymize()`` method. The first pass prints every value captured
    by the pattern ``href="(.*?)"``; the second pass prints the ``href`` of
    every ``a`` tag that has one. A failure in one pass is reported and does
    not prevent the other pass from running.

    Args:
        url: Address of the page to open.

    Returns:
        None. The links are printed.
    """
    ab = anonBrowser()
    ab.anonymize()
    page = ab.open(url)
    html = page.read()

    try:
        print('[+] Printing links from regex.')
        link_finder = re.compile('href="(.*?)"')
        links = link_finder.findall(html)
        for link in links:
            print(link)
    except Exception as exc:
        # Best effort: report the problem instead of hiding it, then go on
        # to the BeautifulSoup pass.
        print('[-] Regex link extraction failed: ' + str(exc))

    try:
        print('\n[+] Printing links from BeautifulSoup.')
        soup = BeautifulSoup(html)
        # BeautifulSoup 3 names its search method ``findAll``; the lowercase
        # ``soup.findall`` is not that method, and the resulting error used
        # to be swallowed by a bare ``except``, so this pass printed nothing.
        links = soup.findAll(name='a')
        for link in links:
            if link.has_key('href'):
                print(link['href'])
    except Exception as exc:
        print('[-] BeautifulSoup link extraction failed: ' + str(exc))
import scraperwiki  # Namespace for the ScraperWiki web site
from BeautifulSoup import BeautifulSoup  # Parser used to read web pages

# Scrape http://www.xe.com/ and store one record per table row that has
# enough cells, using the ScraperWiki metadata/datastore API.
print("TOP 10 countries in currency ")
Page = scraperwiki.scrape('http://www.xe.com/')
Source = BeautifulSoup(Page)
scraperwiki.metadata.save('columns', ['country name ', 'currency name', 'worldrank', 'highest denomination till date', 'year of currency eastablishment', 'trading', 'mobile currency site'])

# NOTE(review): the attribute filter used to be written as
# { "trading" : "1":"10" }, which is not valid Python. It is read here as
# "trading is any of 1..10"; confirm which tables were actually meant.
TradingRanks = [str(rank) for rank in range(1, 11)]
# BeautifulSoup 3 names its search method ``findAll`` (not ``findall``).
MainTable = Source.findAll("table", {"trading": TradingRanks})

# ``findAll`` returns a list of tables, so the rows have to be collected
# from each table rather than from the list itself.
RowDetails = []
for Table in MainTable:
    RowDetails.extend(Table.findAll("tr"))

print("****Scrapping Started*****")
for row in RowDetails:
    Dicrecord = {}  # Dictionary that stores the details of one currency
    Columns = row.findAll("td")
    # Index 8 is read below, so rows with fewer than 9 cells (for example
    # header rows without any <td>) are skipped.
    if len(Columns) > 8:
        Dicrecord['country name'] = Columns[0].text
        Dicrecord['currency name'] = Columns[1].text
        Dicrecord['highest denomination till date'] = Columns[2].text
        Dicrecord['year of currency eastablishment'] = Columns[3].text
        Dicrecord['trading'] = Columns[5].text
        Dicrecord['mobile currency site'] = Columns[8].text
        scraperwiki.datastore.save(["top 10 currency"], Dicrecord)
        print(Dicrecord)
print("****Scrapping Complted*****")

# NOTE(review): the statements below repeat the start of the script above;
# this looks like the beginning of a second pasted copy - confirm and remove
# if nothing further down depends on it.
import scraperwiki  # Namespace for the ScraperWiki web site
from BeautifulSoup import BeautifulSoup  # Parser used to read web pages
print("TOP 10 countries in currency ")
Page = scraperwiki.scrape('http://www.xe.com/')
Source = BeautifulSoup(Page)