def processWallpaper(url):
    """Download one wallpaper page: resolve the original image URL, build a
    tag/category-based file name, and save the image under baseDir.

    Sets the module-level `stop` flag when fileDl returns 42 (file already
    present) while running in update mode, so the caller can halt the crawl.
    """
    wallpaperSoup = getSoup(url)
    # 'data-href' on the download button holds the full-size image URL.
    wallpaperOriginalUrl = wallpaperSoup.find(
        'span', {"class": "btn btn-success download-button"})['data-href']
    sys.stdout.write("\t\tOriginal Wallpaper Url: " + wallpaperOriginalUrl + "\n\t\t\t")
    categories = wallpaperSoup.find('div', {"class": "floatright"}).findAll('strong')
    # '/' would split the name across directories, so replace it with '.'.
    name = wallpaperSoup.find('div', {'class': 'container center'}).find('div').text.strip().replace("/", ".")
    tags = wallpaperSoup.findAll('div', {'style': 'padding:5px 10px; margin:1px; display:inline-block;'})
    # Sorted "[tag1][tag2]..." prefix; empty string when there are no tags.
    tagArray = sorted(tag.text.strip() for tag in tags)
    taglist = "".join("[" + tag + "]" for tag in tagArray)
    # tags + name + separator + "<id>.<ext>" pulled from URL path segments.
    fileName = taglist + name + ((" " if len(taglist) > 0 else "") if len(name) == 0 else " - ") + wallpaperOriginalUrl.split('/')[-4] + "." + wallpaperOriginalUrl.split('/')[-2]
    directoryStructure = baseDir
    # Mirror the site's category breadcrumb as a directory hierarchy,
    # printing it as "a => b => c" while building the path.
    for i, category in enumerate(categories):
        sys.stdout.write(category.text.strip() + ("" if i == (len(categories) - 1) else " => "))
        directoryStructure += category.text.strip() + "/"
    sys.stdout.write("\n\t\t\t\tSaving to: " + directoryStructure + fileName + "\n")
    ensureDir(directoryStructure)
    retval = fileDl(wallpaperOriginalUrl, directoryStructure, "\t\t\t\t\t", fileName)
    # 42 means "already downloaded": in update mode we have caught up with a
    # previous run, so signal the main loop to stop.
    if int(retval) == 42 and update:
        global stop
        stop = True
def dlForDate(singleDate):
    """Fetch dictionary.com's Word of the Day image for *singleDate* and
    download it into the directory given as the first CLI argument."""
    dateSlug = singleDate.strftime("%Y/%m/%d")
    print("Getting Word of the Day for: " + dateSlug)
    wordSoup = getSoup("http://www.dictionary.com/wordoftheday/" + dateSlug + "/")
    # The og:image meta tag carries the word-of-the-day card image URL.
    url = wordSoup.find('meta', {"property": "og:image"})['content']
    print("\tDownloading:" + url)
    fileDl(url, sys.argv[1], "\t\t")
def processWallpaper(url):
    """Fetch one wallpaper page, derive the target directory from the
    category breadcrumb, and download the original image into it.

    Sets the global `stop` flag when fileDl reports 42 in update mode.
    """
    wallpaperSoup = getSoup(url)
    # The download button's 'data-href' points at the full-size image.
    downloadButton = wallpaperSoup.find(
        'span', {"class": "btn btn-success download-button"})
    wallpaperOriginalUrl = downloadButton['data-href']
    sys.stdout.write("\t\tOriginal Wallpaper Url: " + wallpaperOriginalUrl + "\n\t\t\t")
    categories = wallpaperSoup.find('div', {"class": "floatright"}).findAll('strong')
    # "<id>.<ext>" taken from fixed positions in the URL path.
    urlParts = wallpaperOriginalUrl.split('/')
    fileName = urlParts[-4] + "." + urlParts[-2]
    directoryStructure = baseDir
    lastIndex = len(categories) - 1
    # Print the breadcrumb as "a => b => c" while appending each level
    # to the destination path.
    for i in range(0, len(categories)):
        categoryName = categories[i].text.strip()
        sys.stdout.write(categoryName + ("" if i == lastIndex else " => "))
        directoryStructure += categoryName + "/"
    sys.stdout.write("\n\t\t\t\tSaving to: " + directoryStructure + fileName + "\n")
    ensureDir(directoryStructure)
    retval = fileDl(wallpaperOriginalUrl, directoryStructure, "\t\t\t\t\t", fileName)
    # 42 == already downloaded; in update mode that means stop crawling.
    if int(retval) == 42 and update:
        global stop
        stop = True
opts, args = getopt.getopt(sys.argv[1:], 'hi:') except getopt.GetoptError: usage() sys.exit(2) for opt, arg in opts: if opt == '-h': usage() sys.exit() elif opt == '-i': imageFile = arg else: print("Unsupported option and/or argument") sys.exit(2) print("Input file is: " + imageFile) iqdbSoup = getSoup("http://iqdb.org/", {}, {'file': open(imageFile, 'rb')}) #print(iqdbSoup.find('div', {'class': 'pages'}).prettify()) for result in iqdbSoup.find('div', {'class': 'pages'}).findAll('table'): t1 = result.findAll('tr')[0].findAll('th')[0].text if t1 != "Your image": #print(result.prettify()) print("Image Info:") print("\t" + t1) t2 = result.find('td', {'class': 'image'}).find('a')['href'] if t2[:2] == "//": t2 = "http:" + t2 print("\t\tSource:\t\t" + t2) t3 = result.find('img', {'class': 'service-icon'}).nextSibling print("\t\tSource Page:\t" + t3) whs = result.findAll('tr')[3].find('td').text.split(' ') width = int(whs[0].split('×')[0])
#!/usr/bin/env python import sys from bs4 import BeautifulSoup from functions import getSoup, fileDl, ensureDir from datetime import datetime baseDir = "/root/econtalk.org/" baseUrl = "http://www.econtalk.org/" archiveSoup = getSoup(baseUrl + "archives.html") tableRows = archiveSoup.find( 'div', { 'class': 'archive-individual archive-date-based archive' }).findAll('tr') for tableRow in tableRows: if tableRows.index(tableRow) == 0: continue date = datetime.strptime( tableRow.find('td', { 'width': '5%' }).text.strip(), "%Y/%m/%d") extra = len(tableRow.findAll('td')[2].text.strip()) != 0 name = tableRow.find('a').text dirName = date.strftime("%Y-%m-%d") + (" Extra " if extra else " ") + "- " + name + "/" url = tableRow.find('a')['href'] ensureDir(baseDir + dirName) print(dirName[:-1]) if not extra: podcastSoup = getSoup(url) url1 = podcastSoup.find('a', text="Download")['href']
" next status should be " + str(shouldBe) + ", in/decrement will be " + str(newIncr)) if increment == 1: if lastStatus == 200: if getStatus(baseUrl + str(last + 1) + "/") == 404: return last newIncr = 1 return findLastPage(newIncr, lastPlusInc, newStatus) print("Invoking findLastPage()") lastPage = findLastPage() print("Last page is " + str(lastPage)) for page in range(0, lastPage + 1): pageSoup = getSoup("http://thechive.com/page/" + str(page) + "/") print("Page " + str(page) + " of " + str(lastPage)) for article in pageSoup.findAll('article', {"role": "article"}): date = article.find('time').text.strip() h3 = article.find('h3', {"class": "post-title entry-title card-title"}) name = h3.text.strip() url = h3.find('a')['href'] if any(x in name for x in filter): print("\tName: " + name + "\n\t\tDate: " + date) dateFolder = "NonParsable/" try: dateFolder = datetime.strptime( date, '%b %d, %Y').strftime("%Y/%m/%d/") except ValueError: print("\t\tGoing to NonParsable folder") ensureDir(baseDir + dateFolder + name + "/")
index = 0 if len(tags) > 0: for tag in tags: tagArray[index] = tag.text.strip() index += 1 tagArray.sort() for tag in tagArray: taglist += "[" + tag + "]" fileName = taglist + name + ((" " if len(taglist) > 0 else "") if len(name) == 0 else " - ") + wallpaperOriginalUrl.split('/')[-4] + "." + wallpaperOriginalUrl.split('/')[-2] directoryStructure = baseDir for i in range(0, len(categories)): sys.stdout.write(categories[i].text.strip() + ("" if i == (len(categories) - 1) else " => ")) directoryStructure += categories[i].text.strip() + "/" sys.stdout.write("\n\t\t\t\tSaving to: " + directoryStructure + fileName + "\n") ensureDir(directoryStructure) retval = fileDl(wallpaperOriginalUrl, directoryStructure, "\t\t\t\t\t", fileName) if int(retval) == 42 and update: global stop stop = True wallSoup = getSoup(baseUrl + "0") totalPages = int(wallSoup.find('ul', { "class" : "pagination pagination" }).findAll('li')[-1].find('a')['href'].split('=')[1]) for i in range(0, totalPages+1): print("Scraping page " + str(i) + "...") for thumbContainer in getSoup(baseUrl + str(i)).findAll('div', { "class" : "thumb-container-big " }): wallpaperUrl = bUrl + thumbContainer.find('a')['href'] print ("\tbig.php url: " + wallpaperUrl) processWallpaper(wallpaperUrl) if stop: sys.exit(420)
elif o in ("-h", "--help"): usage() sys.exit() elif o == "--from": startDoc = int(a) elif o == "--to": endDoc = int(a) elif o == "--save-dir": baseDir = a else: usage() assert False, "unhandled option" sys.exit() ensureDir(baseDir) documentSoup = getSoup(baseUrl + "0") documentTotal = int( documentSoup.find('div', { "id": "PageRange" }).text.split('of')[1].strip().replace(',', '')) print str(documentTotal) + " documents to download. Let's get started!" if endDoc == -1: endDoc = documentTotal documentNum = 1 + roundDownTo(startDoc, 50) print( "Scanning range from " + str(roundDownTo(startDoc, 50)) + " to " + str(documentTotal if roundUpTo(endDoc, 50) > documentTotal else roundUpTo( endDoc, 50))) for i in range( roundDownTo(startDoc, 50),