def pushedbutton(self, b):
    """Log in to the CMU student portal and auto-fill every pending course
    survey, reporting progress through ``lineEdit_2``.

    ``b`` is the Qt button-clicked payload and is unused.
    """
    account = self.lineEdit.text()
    pasw = self.lineEdit_3.text()
    # Use the robobrowser module to manipulate the web page.
    browser = RoboBrowser(history=True)
    browser.open('http://web1.cmu.edu.tw/stdinfo/login.asp')
    form1 = browser.get_form(id='form1')
    form1['f_id'].value = account
    form1['f_pwd'].value = pasw
    browser.submit_form(form1)
    # A redirect to loginerr.asp means the credentials were rejected.
    if browser.state.url == "http://web1.cmu.edu.tw/stdinfo/loginerr.asp":
        self.lineEdit_2.setText('帳號密碼錯了?')
    else:
        link_one = browser.get_link(text=re.compile('.意見調查'))
        browser.follow_link(link_one)
        # Collect the survey links; the first one is skipped, as in the
        # original (presumably a non-survey header link — TODO confirm).
        # Renamed from 'list', which shadowed the builtin.
        links = list(browser.get_links(text=re.compile('.填寫.')))[1:]
        for li in links:
            browser.follow_link(li)
            form2 = browser.get_form(id='thisform')
            form2['Cos_Q1'].value = '1'
            browser.submit_form(form2)
        self.lineEdit_2.setText('Done!')
def get_mp3_url(lecture_url):
    """Return the href of the first ``.mp3`` link on *lecture_url*.

    Returns None when the page has no mp3 link.
    """
    page = RoboBrowser()
    page.open(lecture_url)
    mp3_link = page.get_link(href=re.compile("\\.mp3$"))
    return mp3_link["href"] if mp3_link is not None else None
class FakeMail(object):
    """Thin wrapper around a 10minutemail.com browser session.

    The constructor opens the site and dumps the parsed landing page to
    ``10minmail.txt`` for offline inspection.

    Raises:
        BlockedException: when the site shows a "Blocked" link (rate limit).
    """

    def __init__(self):
        self.browser = RoboBrowser(history=True)
        self.browser.open('http://10minutemail.com/')
        # Keep a copy of the landing page for debugging.
        with open('10minmail.txt', 'w') as f:
            f.write(str(self.browser.parsed))
        if self.browser.get_link('Blocked'):
            raise BlockedException('to many login Attempts')

    def get_address(self):
        """Print and return the element holding the generated address."""
        address = self.browser.find("div", {"id": "copyAddress"})
        # BUGFIX: was a Python 2 'print address' statement; also return the
        # value so callers of get_address() actually get the address.
        print(address)
        return address

    def read_mail(self):
        # Not implemented yet.
        pass
def pushedbutton(self, b):
    """Log in to the CMU portal and answer every teaching-survey question
    (CH_1..CH_10) with '3', reporting progress through ``lineEdit_2``.

    ``b`` is the Qt button-clicked payload and is unused.
    """
    account = self.lineEdit.text()
    pasw = self.lineEdit_3.text()
    # Use the robobrowser module to manipulate the web page.
    browser = RoboBrowser(history=True)
    browser.open('http://web1.cmu.edu.tw/stdinfo/login.asp')
    form1 = browser.get_form(id='form1')
    form1['f_id'].value = account
    form1['f_pwd'].value = pasw
    browser.submit_form(form1)
    # A redirect to loginerr.asp means the credentials were rejected.
    if browser.state.url == "http://web1.cmu.edu.tw/stdinfo/loginerr.asp":
        self.lineEdit_2.setText('帳號密碼錯了?')
    else:
        self.lineEdit_2.setText('成功登入,填寫中....')
        link_one = browser.get_link(text='教師教學意見調查')
        browser.follow_link(link_one)
        # Renamed from 'list' (shadowed the builtin); slicing replaces the
        # append-then-pop(0) pattern of the original.
        links = list(browser.get_links(text='填寫'))[1:]
        for li in links:
            browser.follow_link(li)
            form2 = browser.get_form(id='thisform')
            # Fill every question CH_1 .. CH_10 with '3' (was ten
            # copy-pasted assignments).
            for n in range(1, 11):
                form2['CH_' + str(n)].value = '3'
            browser.submit_form(form2)
        self.lineEdit_2.setText('Done!')
password = getpass()
challenge_count = 1
while True:  # loop forever
    browser = RoboBrowser(parser='lxml')
    browser.open(SITE_URL)
    # TODO: wrap the sign-in in try/except as the original comment intended.
    signin_form = browser.get_forms()[0]
    signin_form['login'].value = username
    signin_form['password'].value = password
    browser.submit_form(signin_form)
    # Get the leaderboard list.
    browser.follow_link(browser.get_link(text='Leaderboard'))
    bot_name_tags = browser.find_all('div', {'class': 'bot-name'})
    # BUGFIX: the original kept the bare map() object, but len() and
    # .index() below require a list on Python 3; build a list directly.
    bot_names = [tag.string.replace('\t', '').replace('\n', '').lower()
                 for tag in bot_name_tags]
    no_bots = len(bot_names)
    our_rank = bot_names.index('cbteamname') + 1
    print("[INFO] CBTeamName is ranked " + str(our_rank))
    random.seed(os.urandom(8))
    opponent_queue = []
    # Three bots with a lower rank.
    opponent_queue += [bot_names[random.randint(our_rank + 1, no_bots - 1)]
                       for _ in range(3)]
    # one bot with a higher rank
# -*- coding: utf-8 -*-
import re
import sys
from robobrowser import RoboBrowser
from getpass import getpass

account = input('account:')
# NOTE(review): the next three statements were masked with '******' in the
# original source (credential-scrubbing artifact); reconstructed from the
# identical sibling scripts in this file — confirm against upstream.
password = getpass('password:')
browser = RoboBrowser(history=True)
browser.open('http://web1.cmu.edu.tw/stdinfo/login.asp')
form1 = browser.get_form(id='form1')
form1['f_id'].value = account
form1['f_pwd'].value = password
browser.submit_form(form1)
link_one = browser.get_link(text='期中網路教學意見調查')
browser.follow_link(link_one)
# Renamed from 'list' (shadowed the builtin); slicing replaces the
# append-then-pop(0) pattern of the original.
links = list(browser.get_links(text=re.compile('填寫')))[1:]
for li in links:
    browser.follow_link(li)
    form2 = browser.get_form(id='thisform')
    form2['Cos_Q1'].value = '1'
    browser.submit_form(form2)
class Dagr:
    """deviantArt gallery ripper class"""

    NAME = basename(__file__)
    __version__ = "0.60"
    MAX_DEVIATIONS = 1000000  # max deviations

    def __init__(self):
        # Internals
        self.browser = None
        self.errors_count = dict()
        # Configuration
        self.username = ""
        self.password = ""
        self.overwrite = False
        self.reverse = False
        self.testOnly = False
        self.verbose = False
        # Current status
        self.deviant = ""

    def start(self):
        """Prepare the fake browser and attempt a login."""
        if not self.browser:
            # Set up fake browser
            self.set_browser()
        # Always run login
        self.login()

    def set_browser(self):
        """Create the RoboBrowser session with a random desktop user agent."""
        USERAGENTS = (
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.202 Safari/535.1',
            'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:7.0.1) Gecko/20100101',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
            'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0)',
            'Opera/9.99 (Windows NT 5.1; U; pl) Presto/9.9.9',
            'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_5_6; en-US) AppleWebKit/530.5 (KHTML, like Gecko) Chrome/ Safari/530.5',
            'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/533.2 (KHTML, like Gecko) Chrome/6.0',
            'Mozilla/5.0 (Windows; U; Windows NT 6.1; pl; rv:1.9.1) Gecko/20090624 Firefox/3.5 (.NET CLR 3.5.30729)'
        )
        session = req_session()
        session.headers.update({'Referer': 'http://www.deviantart.com/'})
        self.browser = RoboBrowser(history=False, session=session, tries=3,
                                   user_agent=random.choice(USERAGENTS))

    def login(self):
        """Best-effort login; ripping continues even if it fails."""
        if not (self.username and self.password):
            return
        print("Attempting to log in to deviantArt...")
        self.browser.open('https://www.deviantart.com/users/login?ref=http%3A%2F%2Fwww.deviantart.com%2F&remember_me=1')
        form = self.browser.get_forms()[1]
        form['username'] = self.username
        form['password'] = self.password
        self.browser.submit_form(form)
        if self.browser.find(text=re.compile("The password you entered was incorrect")):
            # BUGFIX: this literal was split across a raw newline in the
            # original source (a plain string cannot span lines in Python).
            print("Wrong password or username. Attempting to download anyway.")
        elif self.browser.find(text=re.compile("\"loggedIn\":true")):
            print("Logged in!")
        else:
            print("Login unsuccessful. Attempting to download anyway.")

    def get(self, url, file_name=None):
        """Fetch *url*; return the HTML when file_name is None, otherwise
        save the response body to *file_name* (skipped when it already
        exists and overwrite is off).
        """
        if file_name is not None and not self.overwrite and path_exists(file_name):
            print(file_name + " exists - skipping")
            return
        # TODO Test robobrowser retries and exceptions
        self.browser.open(url)
        if file_name is None:
            return str(self.browser.parsed)
        # Context manager so the handle is closed even if the write fails
        # (original leaked the handle on exceptions).
        with open(file_name, "wb") as local_file:
            local_file.write(self.browser.response.content)

    def find_link(self, link):
        """Resolve a deviation page to ``(filename, direct file link)``.

        Raises:
            DagrException: when no usable link can be found.
        """
        filelink = None
        mature_error = False
        self.browser.open(link)
        # Full image link (via download link)
        img_link = self.browser.get_link(text=re.compile("Download( (Image|File))?"))
        if img_link and img_link.get("href"):
            self.browser.follow_link(img_link)
            filelink = self.browser.url
        else:
            if self.verbose:
                print("Download link not found, falling back to direct image")
            # Fallback 1: try meta (filtering blocked meta)
            filesearch = self.browser.find("meta", {"name": "og:image"})
            if filesearch:
                filelink = filesearch['content']
                if basename(filelink).startswith("noentrythumb-"):
                    filelink = None
                    mature_error = True
            if not filelink:
                # Fallback 2: try collect_rid, full
                filesearch = self.browser.find("img", {"collect_rid": True,
                                                       "class": re.compile(".*full")})
                if not filesearch:
                    # Fallback 3: try collect_rid, normal
                    filesearch = self.browser.find("img", {"collect_rid": True,
                                                           "class": re.compile(".*normal")})
                if filesearch:
                    filelink = filesearch['src']
            if not filelink:
                if mature_error:
                    raise DagrException("probably a mature deviation")
                else:
                    raise DagrException("all attemps to find a link failed")
        filename = basename(filelink)
        return (filename, filelink)

    def handle_download_error(self, link, e):
        """Log a download error and count occurrences per message."""
        error_string = str(e)
        print("Download error (" + link + ") : " + error_string)
        if error_string in self.errors_count:
            self.errors_count[error_string] += 1
        else:
            self.errors_count[error_string] = 1

    def deviant_get(self, mode):
        """Rip one deviant's pages for *mode* (favs/collection/scraps/
        gallery/album/query; an argument may follow after ':')."""
        print("Ripping " + self.deviant + "'s " + mode + "...")
        pat = r"http://[a-zA-Z0-9_-]*\.deviantart\.com/art/[a-zA-Z0-9_-]*"
        modeArg = '_'
        if mode.find(':') != -1:
            mode = mode.split(':', 1)
            modeArg = mode[1]
            mode = mode[0]
        # DEPTH 1: crawl the listing pages and collect deviation URLs.
        pages = []
        for i in range(0, int(Dagr.MAX_DEVIATIONS / 24), 24):
            html = ""
            url = ""
            if mode == "favs":
                url = "http://" + self.deviant.lower() + ".deviantart.com/favourites/?catpath=/&offset=" + str(i)
            elif mode == "collection":
                url = "http://" + self.deviant.lower() + ".deviantart.com/favourites/" + modeArg + "?offset=" + str(i)
            elif mode == "scraps":
                url = "http://" + self.deviant.lower() + ".deviantart.com/gallery/?catpath=scraps&offset=" + str(i)
            elif mode == "gallery":
                url = "http://" + self.deviant.lower() + ".deviantart.com/gallery/?catpath=/&offset=" + str(i)
            elif mode == "album":
                url = "http://" + self.deviant.lower() + ".deviantart.com/gallery/" + modeArg + "?offset=" + str(i)
            elif mode == "query":
                url = "http://" + self.deviant.lower() + ".deviantart.com/gallery/?q=" + modeArg + "&offset=" + str(i)
            else:
                continue
            html = self.get(url)
            prelim = re.findall(pat, html, re.IGNORECASE | re.DOTALL)
            c = len(prelim)
            for match in prelim:
                if match in pages:
                    c -= 1
                else:
                    pages.append(match)
            done = re.findall("(This section has no deviations yet!|This collection has no items yet!)",
                              html, re.IGNORECASE | re.S)
            # Stop when the site reports an empty section or no new link appeared.
            if len(done) >= 1 or c <= 0:
                break
            print(self.deviant + "'s " + mode + " page " + str(int((i / 24) + 1)) + " crawled...")
        if not self.reverse:
            pages.reverse()
        if len(pages) == 0:
            print(self.deviant + "'s " + mode + " had no deviations.")
            return 0
        else:
            try:
                da_make_dirs(self.deviant + "/" + mode)
                if (mode == "query") or (mode == "album") or (mode == "collection"):
                    da_make_dirs(self.deviant + "/" + mode + "/" + modeArg)
            except Exception as e:
                print(str(e))
            print("Total deviations in " + self.deviant + "'s gallery found: " + str(len(pages)))
        # DEPTH 2: download every collected deviation.
        counter2 = 0
        for link in pages:
            counter2 += 1
            if self.verbose:
                print("Downloading " + str(counter2) + " of " + str(len(pages)) + " ( " + link + " )")
            filename = ""
            filelink = ""
            try:
                filename, filelink = self.find_link(link)
            except (KeyboardInterrupt, SystemExit):
                raise
            except Exception as e:
                self.handle_download_error(link, e)
                continue
            if not self.testOnly:
                if (mode == "query") or (mode == "album") or (mode == "collection"):
                    self.get(filelink, self.deviant + "/" + mode + "/" + modeArg + "/" + filename)
                else:
                    self.get(filelink, self.deviant + "/" + mode + "/" + filename)
            else:
                print(filelink)
        print(self.deviant + "'s gallery successfully ripped.")

    def group_get(self, mode):
        """Rip a group's gallery or favourites, folder by folder."""
        if mode == "favs":
            strmode = "favby"
            strmode2 = "favourites"
            strmode3 = "favs gallery"
        elif mode == "gallery":
            strmode = "gallery"
            strmode2 = "gallery"
            strmode3 = "gallery"
        else:
            print("?")
            sys.exit()
        print("Ripping " + self.deviant + "'s " + strmode2 + "...")
        folders = []
        insideFolder = False  # are we inside a gallery folder?
        html = self.get('http://' + self.deviant + '.deviantart.com/' + strmode2 + '/')
        if re.search(strmode2 + r"/\?set=.+&offset=", html, re.IGNORECASE | re.S):
            insideFolder = True
            folders = re.findall(strmode + ":.+ label=\"[^\"]*\"", html, re.IGNORECASE)
        # no repeats
        folders = list(set(folders))
        i = 0
        while not insideFolder:
            html = self.get('http://' + self.deviant + '.deviantart.com/' + strmode2 + '/?offset=' + str(i))
            k = re.findall(strmode + ":" + self.deviant + r"/\d+\"\ +label=\"[^\"]*\"",
                           html, re.IGNORECASE)
            if k == []:
                break
            flag = False
            for match in k:
                if match in folders:
                    flag = True
                else:
                    folders += k
            if self.verbose:
                print("Gallery page " + str(int((i / 10) + 1)) + " crawled...")
            if flag:
                break
            i += 10
        # no repeats
        folders = list(set(folders))
        if len(folders) == 0:
            print(self.deviant + "'s " + strmode3 + " is empty.")
            return 0
        else:
            print("Total folders in " + self.deviant + "'s " + strmode3 + " found: " + str(len(folders)))
        if self.reverse:
            folders.reverse()
        pat = r"http:\/\/[a-zA-Z0-9_-]*\.deviantart\.com\/art\/[a-zA-Z0-9_-]*"
        pages = []
        for folder in folders:
            try:
                folderid = re.search("[0-9]+", folder, re.IGNORECASE).group(0)
                label = re.search("label=\"([^\"]*)", folder, re.IGNORECASE).group(1)
            except AttributeError:
                # BUGFIX: was a bare 'except:'; re.search returns None when
                # the folder entry does not parse, raising AttributeError.
                continue
            for i in range(0, int(Dagr.MAX_DEVIATIONS / 24), 24):
                html = self.get("http://" + self.deviant.lower() + ".deviantart.com/" +
                                strmode2 + "/?set=" + folderid + "&offset=" + str(i - 24))
                prelim = re.findall(pat, html, re.IGNORECASE)
                if not prelim:
                    break
                for x in prelim:
                    p = str(re.sub(r'\\/', '/', x))
                    if p not in pages:
                        pages.append(p)
                if self.verbose:
                    print("Page " + str(int((i / 24) + 1)) + " in folder " + label + " crawled...")
            if not self.reverse:
                pages.reverse()
            try:
                if mode == "favs":
                    da_make_dirs(self.deviant + "/favs/" + label)
                elif mode == "gallery":
                    da_make_dirs(self.deviant + "/" + label)
            except Exception as err:
                print(err)
            counter = 0
            for link in pages:
                counter += 1
                if self.verbose:
                    print("Downloading " + str(counter) + " of " + str(len(pages)) + " ( " + link + " )")
                filename = ""
                filelink = ""
                try:
                    filename, filelink = self.find_link(link)
                except (KeyboardInterrupt, SystemExit):
                    raise
                except Exception as e:
                    self.handle_download_error(link, e)
                    continue
                if not self.testOnly:
                    if mode == "favs":
                        # BUGFIX: was 'self.devianti' — AttributeError at runtime.
                        self.get(filelink, self.deviant + "/favs/" + label + "/" + filename)
                    elif mode == "gallery":
                        self.get(filelink, self.deviant + "/" + label + "/" + filename)
                else:
                    print(filelink)
        print(self.deviant + "'s " + strmode3 + " successfully ripped.")

    def print_errors(self):
        """Print the per-message error counters accumulated while ripping."""
        if len(self.errors_count):
            print("Download errors count:")
            # BUGFIX: dict.iteritems() does not exist on Python 3.
            for error, count in self.errors_count.items():
                print("* " + error + " : " + str(count))
class MarketPoster:
    """Posts and removes trade offers on the Zybez RS2007 price forums."""

    def __init__(self):
        # BUGFIX: '&section' had been rendered as '§ion' (HTML-entity
        # mangling of '&sect;') in the original source.
        self.login_url = ('http://forums.zybez.net/index.php?'
                          'app=curseauth&module=global&section=login')
        self.browser = RoboBrowser(history=False, parser='html.parser')
        self.logged_in = False

    def login(self, login_name, login_password):
        """Sign in; set and return whether the login succeeded."""
        # Removed the no-op self-assignments of the two parameters.
        self.browser.open(self.login_url)
        sign_in_form = self.browser.get_form(class_='authentication-box')
        sign_in_form['ips_username'].value = login_name
        sign_in_form['ips_password'].value = login_password
        self.browser.submit_form(sign_in_form)
        # A successful login lands on the bare index page.
        correct_url = 'http://forums.zybez.net/index.php'
        if self.browser.url == correct_url:
            self.logged_in = True
            return True
        else:
            return False

    def deleteItemPosts(self, post):
        """Delete every one of our active offers for post.item_name."""
        item_url = self.getItemURL(post.item_name)
        self.browser.open(item_url)
        items_to_delete = self.browser.get_links(href=re.compile(
            "do=trade-delete"))
        for i in items_to_delete:
            self.browser.follow_link(i)
            # Re-open the item page after each deletion.
            self.browser.open(item_url)

    def deleteAllPosts(self):
        """Remove all of our active offers, if the removal link is present."""
        self.browser.open('http://forums.zybez.net/runescape-2007-prices')
        delete_button = self.browser.get_link('Remove all active offers')
        if delete_button is not None:
            self.browser.follow_link(delete_button)

    def getItemURL(self, item_name):
        """Resolve an item name to its price-guide URL via the JSON API."""
        item_name = '+'.join(item_name.split())
        item_data_url = 'http://forums.zybez.net' \
            '/runescape-2007-prices/api/item/' + item_name
        item_data_dict = self.browser.session.get(item_data_url).json()
        item_id = item_data_dict['id']
        item_url = 'http://forums.zybez.net/runescape-2007-prices/' \
            + str(item_id) + '-' + item_name
        return item_url

    def postItem(self, post):
        """Replace any existing offers for the item with a fresh post."""
        price = post.price
        quantity = post.quantity
        note = post.note
        offer_type = post.offer_type
        contact_method = post.contact_method
        self.deleteItemPosts(post)
        # BUGFIX: '&section' mojibake repaired here as well.
        post_item_form = self.browser.find(
            action='http://forums.zybez.net/index.php?app=priceguide&module='
                   'public&section=action&do=trade-add')
        post_item_form = self.browser.get_form(post_item_form)
        # Fill out and submit form
        post_item_form['type'].value = str(int(offer_type))
        post_item_form['qty'].value = quantity
        post_item_form['price'].value = price
        post_item_form['notes'].value = note
        post_item_form['contact'].value = str(int(contact_method))
        self.browser.submit_form(post_item_form)
base = "http://www.bbc.co.uk"


def correctURL(url):
    """Prefix site-relative href attributes in *url* with the BBC base URL."""
    base = "http://www.bbc.co.uk"
    return url.replace('href="/', 'href="' + base + "/")


# Scrape anchors from the BBC news "UK" page.
browser = RoboBrowser(parser="html5lib")
browser.open(base + "/news")
link = browser.get_link(text="UK")
browser.open(base + link['href'])
soup = browser.parsed

# Collect all anchors; the story-class filter is kept for reference.
# tags = soup.findAll("a", "story")
tags = soup.findAll("a")

newSoup = BeautifulSoup(features="html5lib")
for anchor in tags:
    # Absolutise site-relative links before copying them over.
    if anchor['href'][0] == "/":
        anchor['href'] = base + anchor['href']
    # Prepend the anchor and then a <br> to the new soup.
    newSoup.insert(0, anchor)
    newSoup.insert(0, soup.new_tag("br"))
def login():
    """Sign in to the yeeyi forum using the module-level credentials."""
    browser.open('http://www.yeeyi.com/bbs/forum.php')
    form = browser.get_form(id='lsform')
    form['fastloginfield'] = login_field  # username or email
    form['username'].value = username.encode('GBK')  # username or email
    form['password'].value = password
    browser.submit_form(form)


# delay = randint(0, 600)
# sleep(delay)
browser.open(url)
button = browser.get_link('提升帖子')
if not button:
    # The bump link is missing when we are not logged in: authenticate
    # once, then reload the page and look the link up again.
    login()
    browser.open(url)
    button = browser.get_link('提升帖子')
browser.follow_link(button)
response = browser.find(id="messagetext").find("p")
with open(log_path, 'a') as log:
    log.write('Button clicked, at ' + ctime(mktime(localtime())) + ' ')
    if response is None:
        log.write('No response, please check')
    else:
        log.write(response.text)
        # print(response.text)
class Dagr:
    """deviantArt gallery ripper class"""

    NAME = basename(__file__)
    __version__ = "0.63"
    MAX_DEVIATIONS = 1000000  # max deviations

    def __init__(self):
        # Internals
        self.browser = None
        self.errors_count = dict()
        # Configuration
        self.directory = getcwd() + "/"
        self.mature = False
        self.overwrite = False
        self.reverse = False
        self.test_only = False
        self.verbose = False
        # Current status
        self.deviant = ""

    def load_configuration(self):
        """Read global then local dagr_settings.ini, if either exists."""
        my_conf = configparser.ConfigParser()
        # Try to read global then local configuration
        my_conf.read([
            expanduser("~/.config/dagr/dagr_settings.ini"),
            path_join(getcwd(), "dagr_settings.ini")
        ])
        if my_conf.has_option("DeviantArt", "MatureContent"):
            self.mature = my_conf.getboolean("DeviantArt", "MatureContent")
        if my_conf.has_option("Dagr", "OutputDirectory"):
            self.directory = abspath(
                expanduser(my_conf.get("Dagr", "OutputDirectory"))) + "/"

    def start(self):
        if not self.browser:
            # Set up fake browser
            self.set_browser()

    def set_browser(self):
        """Create the RoboBrowser session: random UA, optional mature cookie,
        lxml parser when available."""
        user_agents = (
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1'
            ' (KHTML, like Gecko) Chrome/14.0.835.202 Safari/535.1',
            'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:7.0.1) Gecko/20100101',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) AppleWebKit/534.50'
            ' (KHTML, like Gecko) Version/5.1 Safari/534.50',
            'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0)',
            'Opera/9.99 (Windows NT 5.1; U; pl) Presto/9.9.9',
            'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_5_6; en-US)'
            ' AppleWebKit/530.5 (KHTML, like Gecko) Chrome/ Safari/530.5',
            'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/533.2'
            ' (KHTML, like Gecko) Chrome/6.0',
            'Mozilla/5.0 (Windows; U; Windows NT 6.1; pl; rv:1.9.1)'
            ' Gecko/20090624 Firefox/3.5 (.NET CLR 3.5.30729)')
        session = req_session()
        session.headers.update({'Referer': 'https://www.deviantart.com/'})
        if self.mature:
            session.cookies.update({'agegate_state': '1'})
        # Try to use lxml parser if available
        # https://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser
        try:
            __import__("lxml")
            parser = "lxml"
        except ImportError:
            parser = "html.parser"
        self.browser = RoboBrowser(history=False,
                                   session=session,
                                   tries=3,
                                   user_agent=choice(user_agents),
                                   parser=parser)

    def get(self, url, file_name=None):
        """Fetch *url*; return HTML when file_name is None, else write the
        body under self.directory.

        Raises:
            DagrException: on any non-OK HTTP status code.
        """
        if (file_name and not self.overwrite
                and path_exists(self.directory + file_name)):
            print(file_name + " exists - skipping")
            return
        self.browser.open(url)
        if self.browser.response.status_code != req_codes.ok:
            raise DagrException("incorrect status code - " +
                                str(self.browser.response.status_code))
        if file_name is None:
            return str(self.browser.parsed)
        # Context manager so the handle is closed even if the write fails
        # (original leaked the handle on exceptions).
        with open(self.directory + file_name, "wb") as local_file:
            local_file.write(self.browser.response.content)

    def find_link(self, link):
        """Resolve a deviation page to ``(filename, direct file link)``.

        Raises:
            DagrException: when no usable link can be found.
        """
        filelink = None
        mature_error = False
        self.browser.open(link)
        # Full image link (via download link)
        link_text = re.compile("Download( (Image|File))?")
        img_link = self.browser.get_link(text=link_text)
        if img_link and img_link.get("href"):
            self.browser.follow_link(img_link)
            filelink = self.browser.url
            return (basename(filelink), filelink)
        if self.verbose:
            print("Download link not found, falling back to direct image")
        # Fallback 1: try meta (filtering blocked meta)
        filesearch = self.browser.find("meta", {"property": "og:image"})
        if filesearch:
            filelink = filesearch['content']
            if basename(filelink).startswith("noentrythumb-"):
                filelink = None
                mature_error = True
        if not filelink:
            # Fallback 2: try collect_rid, full
            filesearch = self.browser.find("img", {
                "collect_rid": True,
                "class": re.compile(".*full")
            })
            if not filesearch:
                # Fallback 3: try collect_rid, normal
                filesearch = self.browser.find("img", {
                    "collect_rid": True,
                    "class": re.compile(".*normal")
                })
            if filesearch:
                filelink = filesearch['src']
        if not filelink:
            if mature_error:
                if self.mature:
                    raise DagrException("maybe not an image")
                else:
                    # Reconstructed: 'raise' and its expression were split
                    # across a line break in the original source.
                    raise DagrException("maybe a mature deviation/" +
                                        "not an image")
            else:
                raise DagrException("all attemps to find a link failed")
        filename = basename(filelink)
        return (filename, filelink)

    def handle_download_error(self, link, link_error):
        """Log a download error and count occurrences per message."""
        error_string = str(link_error)
        print("Download error (" + link + ") : " + error_string)
        if error_string in self.errors_count:
            self.errors_count[error_string] += 1
        else:
            self.errors_count[error_string] = 1

    def deviant_get(self, mode):
        """Rip one deviant's pages for *mode* (favs/collection/scraps/
        gallery/album/query; an argument may follow after ':')."""
        print("Ripping " + self.deviant + "'s " + mode + "...")
        pat = r"https://[a-zA-Z0-9_-]*\.deviantart\.com/art/[a-zA-Z0-9_-]*"
        mode_arg = '_'
        if mode.find(':') != -1:
            mode = mode.split(':', 1)
            mode_arg = mode[1]
            mode = mode[0]
        # DEPTH 1
        pages = []
        for i in range(0, int(Dagr.MAX_DEVIATIONS / 24), 24):
            html = ""
            url = "https://" + self.deviant.lower() + ".deviantart.com/"
            if mode == "favs":
                url += "favourites/?catpath=/&offset=" + str(i)
            elif mode == "collection":
                url += "favourites/" + mode_arg + "?offset=" + str(i)
            elif mode == "scraps":
                url += "gallery/?catpath=scraps&offset=" + str(i)
            elif mode == "gallery":
                url += "gallery/?catpath=/&offset=" + str(i)
            elif mode == "album":
                url += "gallery/" + mode_arg + "?offset=" + str(i)
            elif mode == "query":
                url += "gallery/?q=" + mode_arg + "&offset=" + str(i)
            else:
                continue
            try:
                html = self.get(url)
            except DagrException:
                print("Could not find " + self.deviant + "'s " + mode)
                return
            prelim = re.findall(pat, html, re.IGNORECASE | re.DOTALL)
            c = len(prelim)
            for match in prelim:
                if match in pages:
                    c -= 1
                else:
                    pages.append(match)
            done = re.findall(
                "(This section has no deviations yet!|"
                "This collection has no items yet!)", html,
                re.IGNORECASE | re.S)
            if len(done) >= 1 or c <= 0:
                break
            print(self.deviant + "'s " + mode + " page " +
                  str(int((i / 24) + 1)) + " crawled...")
        if not self.reverse:
            pages.reverse()
        if not pages:
            print(self.deviant + "'s " + mode + " had no deviations.")
            return
        else:
            try:
                da_make_dirs(self.directory + self.deviant + "/" + mode)
                # Reconstructed: this list literal was split across a line
                # break in the original source.
                if mode in ["query", "album", "collection"]:
                    da_make_dirs(self.directory + self.deviant + "/" + mode +
                                 "/" + mode_arg)
            except OSError as mkdir_error:
                print(str(mkdir_error))
                return
        print("Total deviations in " + self.deviant + "'s " + mode +
              " found: " + str(len(pages)))
        # DEPTH 2
        counter2 = 0
        for link in pages:
            counter2 += 1
            if self.verbose:
                print("Downloading " + str(counter2) + " of " +
                      str(len(pages)) + " ( " + link + " )")
            filename = ""
            filelink = ""
            try:
                filename, filelink = self.find_link(link)
            except (KeyboardInterrupt, SystemExit):
                raise
            except DagrException as link_error:
                self.handle_download_error(link, link_error)
                continue
            if not self.test_only:
                try:
                    if mode in ["query", "album", "collection"]:
                        self.get(filelink, self.deviant + "/" + mode + "/" +
                                 mode_arg + "/" + filename)
                    else:
                        self.get(filelink,
                                 self.deviant + "/" + mode + "/" + filename)
                except DagrException as get_error:
                    self.handle_download_error(link, get_error)
                    continue
            else:
                print(filelink)
        print(self.deviant + "'s " + mode + " successfully ripped.")

    def group_get(self, mode):
        """Rip a group's gallery or favourites, folder by folder."""
        if mode == "favs":
            strmode = "favby"
            strmode2 = "favourites"
            strmode3 = "favs gallery"
        elif mode == "gallery":
            strmode = "gallery"
            strmode2 = "gallery"
            strmode3 = "gallery"
        else:
            print("?")
            sys.exit()
        print("Ripping " + self.deviant + "'s " + strmode2 + "...")
        folders = []
        inside_folder = False  # are we inside a gallery folder?
        html = self.get('https://' + self.deviant + '.deviantart.com/' +
                        strmode2 + '/')
        if re.search(strmode2 + r"/\?set=.+&offset=", html,
                     re.IGNORECASE | re.S):
            inside_folder = True
            folders = re.findall(strmode + ":.+ label=\"[^\"]*\"", html,
                                 re.IGNORECASE)
        # no repeats
        folders = list(set(folders))
        i = 0
        while not inside_folder:
            html = self.get('https://' + self.deviant + '.deviantart.com/' +
                            strmode2 + '/?offset=' + str(i))
            k = re.findall(
                strmode + ":" + self.deviant + r"/\d+\"\ +label=\"[^\"]*\"",
                html, re.IGNORECASE)
            if k == []:
                break
            flag = False
            for match in k:
                if match in folders:
                    flag = True
                else:
                    # NOTE(review): extends with the whole of k per new
                    # match; harmless because of the set() dedupe below,
                    # kept as in the original.
                    folders += k
            if self.verbose:
                print("Gallery page " + str(int((i / 10) + 1)) +
                      " crawled...")
            if flag:
                break
            i += 10
        # no repeats
        folders = list(set(folders))
        if not folders:
            print(self.deviant + "'s " + strmode3 + " is empty.")
            return
        else:
            print("Total folders in " + self.deviant + "'s " + strmode3 +
                  " found: " + str(len(folders)))
        if self.reverse:
            folders.reverse()
        pat = (r"https:\\/\\/[a-zA-Z0-9_-]*\.deviantart\.com"
               r"\\/art\\/[a-zA-Z0-9_-]*")
        pages = []
        for folder in folders:
            folderid = re.search("[0-9]+", folder, re.IGNORECASE).group(0)
            label = re.search("label=\"([^\"]*)", folder,
                              re.IGNORECASE).group(1)
            for i in range(0, int(Dagr.MAX_DEVIATIONS / 24), 24):
                html = self.get("https://" + self.deviant.lower() +
                                ".deviantart.com/" + strmode2 + "/?set=" +
                                folderid + "&offset=" + str(i - 24))
                prelim = re.findall(pat, html, re.IGNORECASE)
                if not prelim:
                    break
                for x in prelim:
                    p = str(re.sub(r'\\/', '/', x))
                    if p not in pages:
                        pages.append(p)
                if self.verbose:
                    print("Page " + str(int((i / 24) + 1)) + " in folder " +
                          label + " crawled...")
            if not self.reverse:
                pages.reverse()
            try:
                if mode == "favs":
                    da_make_dirs(self.directory + self.deviant + "/favs/" +
                                 label)
                elif mode == "gallery":
                    da_make_dirs(self.directory + self.deviant + "/" + label)
            except OSError as mkdir_error:
                print(str(mkdir_error))
            counter = 0
            for link in pages:
                counter += 1
                if self.verbose:
                    # Reconstructed: this print call was split across a line
                    # break in the original source.
                    print("Downloading " + str(counter) + " of " +
                          str(len(pages)) + " ( " + link + " )")
                filename = ""
                filelink = ""
                try:
                    filename, filelink = self.find_link(link)
                except (KeyboardInterrupt, SystemExit):
                    raise
                except Exception as link_error:
                    self.handle_download_error(link, link_error)
                    continue
                if not self.test_only:
                    try:
                        if mode == "favs":
                            self.get(
                                filelink, self.deviant + "/favs/" + label +
                                "/" + filename)
                        elif mode == "gallery":
                            self.get(
                                filelink,
                                self.deviant + "/" + label + "/" + filename)
                    except DagrException as get_error:
                        self.handle_download_error(link, get_error)
                        continue
                else:
                    print(filelink)
        print(self.deviant + "'s " + strmode3 + " successfully ripped.")

    def print_errors(self):
        """Print the per-message error counters accumulated while ripping."""
        if self.errors_count:
            print("Download errors count:")
            for error in self.errors_count:
                print("* " + error + " : " + str(self.errors_count[error]))
#!/usr/bin/python
# downloader.py — submit a primewire URL to tubeoffline and dump the result.
from robobrowser import RoboBrowser

tubeBase = "http://www.tubeoffline.com/download-1channel-videos.php"
sourceBase = "http://www.primewire.ag/"
sourceArgs = "tv-1386995-Game-of-Thrones/season-3-episode-7"

browser = RoboBrowser()
browser.open(tubeBase)
form = browser.get_form(id='formStyle')
form['video'].value = sourceBase + sourceArgs
browser.submit_form(form)
link = browser.get_link(id='generateLink')
# BUGFIX: the original used Python 2 'print x' statements; print() calls
# are valid on both Python 2 and 3 for a single argument.
print(browser.find(src="http://javaplugin.org/WL/grp1/gkpluginsAPI.js"))
print(browser.parsed)
# browser.follow_link('generateLink')
# browser.follow_link('HERE')
import re

from robobrowser import RoboBrowser

# Browse to Genius
browser = RoboBrowser(history=True)
browser.open('http://www.genius.com')

# Search for Porcupine Tree
# BUGFIX: every get_form line was commented out, leaving 'form' undefined;
# keep the simplest variant active and the alternatives for reference.
form = browser.get_forms()[0]
# form = browser.get_form(action='/search')
# form = browser.get_form(class_='global_search global_search--giant')
print(form)  # was Python 2 'print form'
form['q'].value = 'porcupine tree'
response = browser.submit_form(form)
print(response)  # BUGFIX: was 'print respo' — a NameError

# Look up the first song
songs = browser.select('.song_link')
browser.follow_link(songs[0])
lyrics = browser.select('.lyrics')
lyrics[0].text

# Back to results page
browser.back()

# Look up my favorite song
song_link = browser.get_link('trains')
browser.follow_link(song_link)

# Can also search HTML using regex patterns
# BUGFIX: 're' was used below but never imported in the original snippet.
lyrics = browser.find(class_=re.compile(r'\blyrics\b'))
lyrics.text
# NOTE(review): this is a captured IPython session transcript kept as
# statements; the bare expressions are no-ops outside the REPL, and
# weather_url / api_key / requests come from earlier in the session.
weather_url.format(CITY="Bengaluru", APIKEY=api_key)
q = weather_url.format(CITY="Bengaluru", APIKEY=api_key)
r = requests.get(q)
r
r.ok
r.json()
from robobrowser import RoboBrowser
br = RoboBrowser(parser="lxml")
br
br.open("http://pypi.org/")
br.response
br.response.ok
br.url
br.get_links()
# BUGFIX: the transcript contained the unterminated literal
# br.get_link("Register) — the corrected call follows.
br.get_link("Register")
br.get_link("Register")
br.get_link(text="Register")
br.get_link()
get_ipython().run_line_magic('pinfo', 'br.get_link')
br.get_link(text="Register")
br.get_link(text_re="Register")
br.get_forms()
br.get_form()
f = br.get_form()
f["q"]
f["q"] = "xml"
br.submit_form(f)
br.url
br.response.ok
br.find("div")
br.open("http://www.chandrashekar.info/")
from bs4 import BeautifulSoup, Tag

base = "http://www.bbc.co.uk"


def correctURL(url):
    """Rewrite site-relative href attributes in *url* to absolute BBC URLs."""
    base = "http://www.bbc.co.uk"
    return url.replace('href="/', 'href="' + base + "/")


# Scrape anchors from the BBC news "UK" page.
browser = RoboBrowser(parser="html5lib")
browser.open(base + "/news")
link = browser.get_link(text="UK")
browser.open(base + link['href'])
soup = browser.parsed

# Collect every anchor; the story-class filter is kept for reference.
# tags = soup.findAll("a", "story")
tags = soup.findAll("a")

newSoup = BeautifulSoup(features="html5lib")
for tag in tags:
    # Absolutise site-relative links before copying them over.
    if tag['href'][0] == "/":
        tag['href'] = base + tag['href']
    # Prepend the anchor and then a <br> to the new soup.
    newSoup.insert(0, tag)
    newSoup.insert(0, soup.new_tag("br"))