def redirect(self):
    try:
        if self.agent:
            br = Browser()
            UserAgent = "Mozilla/5.0 (Windows NT 5.1; rv:31.0) Gecko/20100101 Firefox/31.0"
            br.set_handle_robots(False)
            # Attach the user agent defined above (the original built it but
            # sent a misspelled "Fifefox" header instead).
            br.addheaders = [("User-agent", UserAgent)]
            remote_url = br.open(self.target).geturl()
        else:
            remote_url = u.urlopen(self.target).geturl()
        return remote_url
    except Exception as e:
        print(e)
def card_dates(tipo, numero):
    cod = []
    cod.append(tipo)
    card = {}  # result dictionary (assumed local here; the original never defined it in this function)
    logging.info('Looking up CRTM card data for ' + tipo + ' ' + numero)
    browser = Browser()
    response = browser.open('http://tarjetatransportepublico.crtm.es/CRTM-ABONOS/consultaSaldo.aspx')
    if response.code == 200:  # Check that the page responds
        browser.select_form("aspnetForm")  # Select the form
        browser["ctl00$cntPh$txtNumTTP"] = numero  # Fill in the card details
        browser["ctl00$cntPh$dpdCodigoTTP"] = cod
        page = browser.submit(name="ctl00$cntPh$btnConsultar")  # "Press" the Continue button
        soup = BeautifulSoup(page.read())
        # Find the full card number
        card_no = soup.findAll('span', attrs={"id": 'ctl00_cntPh_lblNumeroTarjeta'})
        # Find the card's validity data
        results = soup.findAll('div', attrs={"id": 'ctl00_cntPh_tableResultados'})
        spans = results[0].findAll('span')  # Split out the individual fields
        if card_no:  # Check whether the card entered returned a valid result
            card["card_no"] = card_no[0].renderContents()
            card["subscription_age"] = spans[1].text
            card["load_date"] = dparser.parse(spans[2].text, fuzzy=True, dayfirst=True)
            card["valid_date"] = dparser.parse(spans[3].text, fuzzy=True, dayfirst=True)
            card["first_use"] = dparser.parse(spans[4].text, fuzzy=True, dayfirst=True)
            card["renovation_date"] = dparser.parse(spans[5].text, fuzzy=True, dayfirst=True)
        else:
            card["card_no"] = '0'
            card["subscription_age"] = 'Nope'
            card["load_date"] = datetime.datetime(1991, 1, 1)
            card["valid_date"] = datetime.datetime(1991, 1, 1)
            card["first_use"] = datetime.datetime(1991, 1, 1)
            card["renovation_date"] = datetime.datetime(1991, 1, 1)
        return card
    else:
        logging.info("The CRTM page is not available")
        card["card_no"] = '0'
        card["subscription_age"] = 'Nope'
        card["load_date"] = datetime.datetime(1991, 1, 1)
        card["valid_date"] = datetime.datetime(1991, 1, 1)
        card["first_use"] = datetime.datetime(1991, 1, 1)
        card["renovation_date"] = datetime.datetime(1991, 1, 1)
        return card
def scrape(url):
    global data, data2, data3
    mech = Browser()
    page = mech.open(url)
    html = page.read()
    soup = BeautifulSoup(html)
    table = soup.find('table', width='100%', cellspacing='0',
                      cellpadding='2', border='0')
    data2 = []
    rowIndex = 0
    for row in table.findAll('tr')[1:]:
        data = str(row.getText(separator=' '))
        data = data.strip()  # strip() returns a new string; assign the result
        data = data.replace(' ', '')
        data = data.replace('\n', '')
        data = data.replace('-', '0')
        data = data.replace(',', '')
        data = data.replace('\'', '')
        data = shlex.split(data)
        rowIndex += 1
        for i in data[:]:
            if i.isdigit():
                data2.append(int(i))
            elif contains(i, '()'):  # parenthesised values are negative
                i = i.replace('(', '')
                i = i.replace(')', '')
                data2.append(-int(i))
            else:
                data.remove(i)
    data3 = split_list(data2, 3)
    if url == str(income):
        create_income_vars()
    elif url == str(balance):
        create_balance_vars()
    else:
        create_cash_vars()
def confirm_master_runing(url):
    max_retries = 1000  # maximum number of times to retry
    interval = 3        # number of seconds to wait between retries
    br = Browser()
    br.set_handle_robots(False)
    tried = 0
    connected = False
    count = 1  # count forms found in url
    while not connected:
        try:
            response = br.open(url)
            connected = True  # if the line above fails, this is never executed
        except:
            print "connection could not be established"
            time.sleep(interval)
            tried += 1
            if tried > max_retries:
                exit()
def update(acct, slp, if_update=True):
    br = Browser()
    br.set_handle_robots(False)
    user, pwd, dockey = acct
    #login(br, 'https://www.dice.com/dashboard/logout', user, pwd)
    login(br, 'https://www.dice.com/dashboard/login', user, pwd)
    (sfom, sto) = slp
    print 'Processing', dockey, slp
    #nots = "http://www.dice.com/profman/servlet/ProfMan?op=1011&MENU_PROFILES=cb7bb7bace4884843e70679a3d15525e&MENU_DEACITIVATE=Make%20Not%20Searchable&makeNotSearchable"
    menud = 'Make%20Not%20Searchable'
    browse(br,
           "http://www.dice.com/profman/servlet/ProfMan?op=1011&MENU_PROFILES=%s&MENU_DEACITIVATE=%s&makeNotSearchable"
           % (dockey, menud))
    sl = random.randrange(10, 15)
    print "Hide break: %d sec." % sl
    time.sleep(sl)
    #browse(br, "https://secure.dice.com/regman/profile.html?dockey=fe637379adaa31c7020ffe731f90cc36")
    #time.sleep(random.randrange(10, 50))
    if if_update:
        update_resume(dockey)
        sl = random.randrange(30, 60)
        print "Update break: %d sec." % sl
        time.sleep(sl)
    browse(br, "https://secure.dice.com/regman/profile.html?dockey=%s" % dockey)
    time.sleep(random.randrange(5, 15))
    menus = 'Make%20Searchable'
    unhide_url = ("http://www.dice.com/profman/servlet/ProfMan?op=1011&MENU_PROFILES=%s&MENU_STATUS_CHANGE=%s"
                  % (dockey, menus))
    #print unhide_url
    #e(0)
    browse(br, unhide_url)
    sl = random.randrange(sfom, sto)
    print "Unhide break: %d sec (%s min)." % (sl, sl / 60)
    time.sleep(sl)
    #browse(br, "https://secure.dice.com/regman/profile.html?dockey=%s" % dockey)
    #browse(br, "https://www.dice.com/dashboard#/profiles/active")
    time.sleep(random.randrange(50, 150))
    browse(br, 'https://www.dice.com/dashboard/logout')
def test_lost_your_password_for_internal_accounts(self):
    """websession - sending lost password for internal admin account"""
    try_with_account = CFG_SITE_ADMIN_EMAIL
    # click on "send lost password" for the CFG_SITE_ADMIN_EMAIL internal account
    browser = Browser()
    browser.open(CFG_SITE_SECURE_URL + "/youraccount/lost")
    browser.select_form(nr=0)
    browser['p_email'] = try_with_account
    try:
        browser.submit()
    except Exception, e:
        # Restore the admin password (send_email set it to a random number)
        run_sql("UPDATE user SET password=AES_ENCRYPT(email, '') "
                "WHERE id=1")
        self.fail(
            "Obtained %s: probably the email server is not installed "
            "correctly." % e)
def amazon(email):
    brows = Browser()
    brows.set_handle_robots(False)
    brows._factory.is_html = True
    brows.set_cookiejar(cookielib.LWPCookieJar())
    brows.addheaders = [('User-agent', random.choice(ua["browsers"]["chrome"]))]
    brows.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)
    url = "https://www.amazon.com/ap/signin?openid.pape.max_auth_age=0&openid.return_to=https%3A%2F%2Fwww.amazon.com%2F%3F_encoding%3DUTF8%26ref_%3Dnav_ya_signin&openid.identity=http%3A%2F%2Fspecs.openid.net%2Fauth%2F2.0%2Fidentifier_select&openid.assoc_handle=usflex&openid.mode=checkid_setup&openid.claimed_id=http%3A%2F%2Fspecs.openid.net%2Fauth%2F2.0%2Fidentifier_select&openid.ns=http%3A%2F%2Fspecs.openid.net%2Fauth%2F2.0&"
    brows.open(url, timeout=10)
    brows.select_form(nr=0)
    brows.form['email'] = email
    brows.method = "POST"
    submit = brows.submit()
    soup = BeautifulSoup(submit.read().decode("utf-8"), "lxml")
    if soup.find("div", {"id": "auth-password-missing-alert"}):
        return {"rateLimit": False, "exists": True, "emailrecovery": None,
                "phoneNumber": None, "others": None}
    else:
        return {"rateLimit": False, "exists": False, "emailrecovery": None,
                "phoneNumber": None, "others": None}
def force_build(self, username="******", comments=None):
    def predicate(form):
        try:
            return form.find_control("username")
        except Exception:
            return False

    if not self._browser:
        self._browser = Browser()
        self._browser.set_handle_robots(False)  # The builder pages are excluded by robots.txt
    # ignore false positives for missing Browser methods - pylint: disable=E1102
    self._browser.open(self.url())
    self._browser.select_form(predicate=predicate)
    self._browser["username"] = username
    if comments:
        self._browser["comments"] = comments
    return self._browser.submit()
def test_clone_browser(self):
    from mechanize import Browser
    br = Browser()
    br.set_handle_refresh(True, max_time=237, honor_time=True)
    br.set_handle_robots(False)
    cbr = copy.copy(br)
    for h, ch in zip(br.handlers, cbr.handlers):
        self.assertIsNot(h, ch)
        self.assertIs(ch.parent, cbr)
        self.assertIs(h.__class__, ch.__class__)
    self.assertEqual(set(br._ua_handlers), set(cbr._ua_handlers))
    self.assertIs(br._ua_handlers['_cookies'].cookiejar,
                  cbr._ua_handlers['_cookies'].cookiejar)
    self.assertIsNot(br.addheaders, cbr.addheaders)
    self.assertEqual(br.addheaders, cbr.addheaders)
    self.assertIs(br.finalize_request_headers, cbr.finalize_request_headers)
    h = cbr._ua_handlers['_refresh']
    self.assertEqual((h.honor_time, h.max_time), (True, 237))
def getGamesAndOddsFin(url):
    br = Browser()
    # site demands a user-agent that isn't a robot
    br.addheaders = [('User-agent', 'Firefox')]
    # retrieve veikkaus vakio mobile home page, browse to the game percent page
    # and store the page
    br.open(url)
    for link in br.links():
        siteMatch = re.compile(VEIKKAUS_LINK_TEXT).search(link.text)
        if siteMatch:
            resp = br.follow_link(link)
            result = resp.get_data()
            break
    # html page as a BeautifulSoup object
    page = BeautifulSoup(result)
    # find game names and odds
    gameData = page.findAll('td')
    # full names
    nameList = []
    # concatenated names used to select the correct names from betfair data
    refNamesFin = []
    # full name + odds
    oddsList = []
    # parse game names (first 13 games) into list
    # 4 elements per row and 13 rows => 52
    for idx in range(ELEMENT_NUM):
        if idx % 4 == 0:
            gameName = gameData[idx].string.strip().encode(ENCODING)
            # full team names
            nameList.append(gameName)
            teamNames = gameName.split(' - ')
            gameString = (createTeamName(teamNames[0]) + ' - ' +
                          createTeamName(teamNames[1]))
            # concatenated team names
            refNamesFin.append(gameString)
        else:
            gameOdds = gameData[idx].string.strip().encode(ENCODING)
            # calculate odds from game percents and limit decimal places
            oddsList.append('{0:.2f}'.format(100 / float(gameOdds)))
    # create odds tuples, 3 odds per game (1, x, 2)
    gameOdds = zip(*[iter(oddsList)] * 3)
    # return game names and corresponding odds
    return (nameList, gameOdds, refNamesFin)
def get_data(movie):
    try:
        br = Browser()
        br.open("http://www.imdb.com/find?s=tt&q=" + movie)
        link = list(br.links(url_regex=re.compile(r"/title/tt*")))[0]
    except:
        print "Not Found!"
        exit(3)
    res = br.follow_link(link)
    soup = BeautifulSoup(res.read())
    title_year = soup.find('span', id='titleYear')
    year_str = str(title_year)
    year = re.search('.*([0-9]{4}).*', year_str).group(1)
    title = soup.find('title').contents[0]
    rate = soup.find('span', itemprop='ratingValue')
    rating = str(rate.contents[0])
    actors = []
    actors_soup = soup.findAll('span', itemprop='actors')
    for i in actors_soup:
        i_str = str(i)
        j = i_str.rpartition('itemprop="name"')[-1]
        actors.append(re.search('\>(.*?)\<', j).group(1))
    directors = []
    director_soup = soup.findAll('span', itemprop='director')
    for i in director_soup:
        i_str = str(i)
        j = i_str.rpartition('itemprop="name"')[-1]
        directors.append(re.search('\>(.*?)\<', j).group(1))
    votes = soup.find('span', itemprop='ratingCount').contents[0]
    response = []
    response.append({"Movie : ": title})
    response.append({"Rating: ": rating})
    response.append({"Votes ": votes})
    response.append({"Release Year : ": year})
    response.append({"Director : ": directors})
    response.append({"Actors : ": actors})
    return json.dumps(response)
def __init__(self, num, keyword):
    self.num = num
    self.keyword = keyword
    self.br = Browser(factory=mechanize.RobustFactory())
    self.br.set_handle_robots(False)
    self.br.addheaders = [
        ('User-Agent', userAgent),
        ('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8')
    ]
    self.cj = mechanize.LWPCookieJar()
    self.br.set_cookiejar(self.cj)
    self.br._factory.is_html = True
    self.br.set_handle_refresh(False)
    self.idletime = 0
    threading.Thread.__init__(self)
    self.url = ""
    self.depth = 0
    self.output = ""
def upload(count):
    br = Browser()
    br.set_handle_robots(False)
    br.open('http://zincpharmer.csb.pitt.edu/pharmville/')
    form = list(br.forms())[0]
    br.form = form
    form['receptor'] = ['traf2']
    form.add_file(open(outputBase + 'minimized_results.sdf'), 'text/plain',
                  'upload.sdf')
    form['userid'] = 'yifengt'
    form['name'] = 'Test'
    response = br.submit()
    print str(count) + '.sdf'
    analysis = process()
    analysis.feed(response.read())
    analysis.close()
    br.close()
def movie(request, uId):
    if 'q' in request.GET:
        movie = request.GET['q']
        movie_search = '+'.join(movie.split())
        base_url = 'http://www.imdb.com/find?q='
        url = base_url + movie_search + '&s=all'
        title_search = re.compile(r'/title/tt\d+')
        br = Browser()
        br.open(url)
        link = br.find_link(url_regex=re.compile(r'/title/tt.*'))
        res = br.follow_link(link)
        soup = BeautifulSoup(res.read())
        print soup
        info = {}
        movie_title = getunicode(soup.find('title'))
        info['title'] = movie_title
        strng = ""
        rate = soup.find('span', itemprop='ratingValue')
        rating = getunicode(rate)
        info['rating'] = rating
        img = soup.find('img', {'itemprop': 'image'})['src']
        image = getunicode(img)
        #image = image.split('.jpg')[0]
        info['img'] = "https://d202m5krfqbpi5.cloudfront.net/books/1370898422l/18054175.jpg"
        des = soup.find('meta', {'name': 'description'})['content']
        descp = getunicode(des)
        info['description'] = descp
        genre = []
        infobar = soup.find('div', {'class': 'infobar'})
        r = infobar.find('', {'title': True})['title']
        genrelist = infobar.findAll('a', {'href': True})
        for i in range(len(genrelist) - 1):
            genre.append(getunicode(genrelist[i]))
        info['genre'] = genre
        release_date = getunicode(genrelist[-1])
        info['date'] = release_date
        return render_to_response('moviedetails.html', {'Movie': info},
                                  context_instance=RequestContext(request))
    else:
        return HttpResponseRedirect('/%s/?e=1' % uId)  # set url to /userid
def getQuote(movie_search, quote_prefix):
    base_url = 'http://www.imdb.com/find?q='
    url = base_url + movie_search + '&s=all'
    title_search = re.compile('/title/tt\d+')
    br = Browser()
    br.open(url)
    link = br.find_link(
        url_regex=re.compile(r'/title/tt(.*)/?ref_=fn_al_tt_1'))
    res = br.follow_link(link)
    soup = BeautifulSoup(res.read(), 'html.parser')
    qtlink = br.find_link(url='trivia?tab=qt&ref_=tt_trv_qu')
    qtres = br.follow_link(qtlink)
    qtsoup = BeautifulSoup(qtres.read(), 'html.parser')
    searchFor = quote_prefix
    all_chars = []
    quote_entry = {}
    char_match = qtsoup.find_all("span", class_=re.compile("character"))
    for c in char_match:
        if c.string not in all_chars:
            all_chars.append(c.string)
    tag = qtsoup.p
    tag.a.decompose()
    ch_match = []
    for tag in qtsoup.find_all(re.compile('p')):
        if tag.name == 'p':
            ch_match = tag.find_all(text=re.compile(":(.*)"))
            for allqt in ch_match:
                if searchFor in allqt:
                    print allqt
def add_to_cart():
    """ """
    data = request.json
    resp = Response(u'%s')
    uid = data["uid"]
    pwd = data["pwd"]
    items = data["items"]
    params = {"items": items}
    params = urllib.quote(json.dumps(params)).replace('%27', '%22').replace('%20', '')
    #print params
    br = Browser()
    br.addheaders = [('User-agent', 'Firefox')]
    br.set_handle_robots(False)
    br.open('https://www.freshdirect.com/login/login.jsp')
    br.select_form(name="fd_login")
    br['userid'] = uid
    br['password'] = pwd
    br.submit()
    #items = data["items"]
    br.addheaders = [('Content-Type', 'application/x-www-form-urlencoded'),
                     ('User-agent', 'Firefox')]
    # encoding the dict produces single quotes (%27) when it should produce double quotes (%22)
    #br.open("https://www.freshdirect.com/api/addtocart", "data=%7B%22items%22%3A%5B%7B%22salesUnit%22%3A%22EA%22%2C%22quantity%22%3A%223%22%2C%22skuCode%22%3A%22FRU0069115%22%2C%22pageType%22%3A%22BROWSE%22%7D%5D%7D")
    br.open("https://www.freshdirect.com/api/addtocart", "data=" + params)
    soup = BS(br.response().read(), "lxml")
    print soup
    #resp = Response(u'%s' % json_output)
    resp.headers['Content-Type'] = 'application/json; charset=utf-8'
    return resp
def get_ratings(movies_of_my_genre):
    for movie in movies_of_my_genre:
        try:
            print "Checking IMDb rating of : " + movie.movie_name.replace('\t', '')
            movie_search = '+'.join(movie.movie_name.split())
            movie_url = base_url + movie_search + '&s=all'
            br = Browser()
            br.open(movie_url)
            link = br.find_link(url_regex=re.compile(r'/title/tt.*'))
            res = br.follow_link(link)
            soup = BeautifulSoup(res.read(), "lxml")
            movie_title = soup.find('title').contents[0]
            rate = soup.find('span', itemprop='ratingValue')
            if rate is not None:
                movie.movie_rating = float(rate.contents[0])
            else:
                movie.movie_rating = 0
        except:
            movie.movie_rating = 0
def token(timestamp, username, t_id, t_short, submission_num, base_url):
    """Execute the request to play a token on a submission.

    timestamp (int): seconds from the start.
    username (string): username issuing the submission.
    t_id (string): id of the task.
    t_short (string): short name of the task.
    submission_num (string): id of the submission to play the token on.
    base_url (string): http address of CWS.

    """
    print("\n%s - Playing token for %s on task %s" %
          (to_time(timestamp), username, t_short), end='')

    browser = Browser()
    browser.set_handle_robots(False)
    LoginRequest(browser, username, "", base_url=base_url).execute()
    TokenRequest(browser, (int(t_id), t_short),
                 submission_num=submission_num, base_url=base_url).execute()
def create_browsing_context():
    if not os.path.exists('corpas.focloir.ie.credentials'):  # untried
        # The prompting lines were masked in the source; this is a minimal
        # reconstruction consistent with the read-back code below (the
        # original also opened the file in an invalid 'rw' mode).
        username = input('corpas.focloir.ie username:')
        password = input('corpas.focloir.ie password:')
        with open('corpas.focloir.ie.credentials', 'w') as f:
            f.write(username + '\n' + password)
    with open('corpas.focloir.ie.credentials', 'r') as f:
        username, password = f.read().split('\n')
    br = Browser()
    br.set_handle_robots(False)
    #br.open('http://corpas.focloir.ie/')
    br.open('http://focloir.sketchengine.co.uk/')
    br.add_password(
        'http://focloir.sketchengine.co.uk/auth/run.cgi/simple_search?home=1',
        username, password)
    for link in br.links():
        if link.text.lower().replace(' ', '') == 'login':
            br.follow_link(link)
    return br
def search(arg):
    assert '/' not in arg  # because we use it in a filename
    cache = rc['authority_cache']
    filename = cache + '/' + arg
    if os.path.exists(filename):
        return [eval(i) for i in open(filename)]
    br = Browser()
    br.set_handle_robots(False)
    br.open(start)
    br.select_form(name="querybox")
    br['Search_Arg'] = arg.encode('utf-8')
    br['Search_Code'] = ['NHED_']
    res = br.submit()
    found = list(read_serp(res))
    br.close()
    out = open(filename, 'w')
    for i in found:
        print >> out, i
    out.close()
    return found
def browser(self):
    '''setup mechanize browser instance'''
    if not self.br:
        br = Browser()
        cj = cookielib.LWPCookieJar()
        br.set_cookiejar(cj)
        # set user agent and referer headers to maintain aws happiness
        br.addheaders = [('User-agent', self.user_agent),
                         ('Referer', self.referer)]
        br.set_handle_equiv(True)
        br.set_handle_redirect(True)
        br.set_handle_referer(True)
        br.set_handle_robots(False)
        #br.set_handle_gzip(True)  # experimental & probably unnecessary
        # proxy support
        #br.set_proxies({"http": "user:[email protected]:3128"})
        #br.set_proxies({"http": "myproxy.example.com:3128"})
        #br.add_proxy_password("user", "password")
        self.br = br
    return self.br
def getCurrentCoverageDirectory(baseURL):
    mech = Browser()
    mech.open(baseURL)
    currentLink = None
    for link in mech.links():
        # Find the first directory link that is not the parent
        if link.url.endswith("/") and not link.url.startswith("/"):
            currentLink = link
            break
    if currentLink is None:
        mech.close()
        raise RuntimeError("Unable to find current coverage directory")
    linkURL = currentLink.base_url + currentLink.url
    mech.close()
    return linkURL
def main():
    br = Browser()
    names = list("ABCDEFGHIJKLMNOPQRSTUVWXYZ")
    for i in range(100):
        br.set_handle_robots(False)
        br.open("http://www.ultimatelovecalc.com/love/1352035")
        br.select_form(name="formMain")
        a1 = random.sample(names, 5)
        a2 = random.sample(names, 5)
        a3 = random.sample(names, 5)
        br["fname"] = "".join(a1)
        br["cname1"] = "".join(a2)
        br["cname2"] = "".join(a3)
        br["cname3"] = "".join(a1)
        sub = br.submit()
        donech = sub.read()
        print i
def ncbiUrlBuilder(snp):
    """ncbiUrlBuilder builds the URL used by NCBI to query a particular SNP.

    An example URL for SNP rs9606708 is:
    http://www.ncbi.nlm.nih.gov/projects/SNP/snp_ref.cgi?rs=9606708
    and gives the information page for that SNP. The URL is built from the
    NCBI base URL by simple string concatenation, passed to mechanize/Browser
    to open the site, and the opened page is returned to the program.

    Arguments:
    snp -- the snp id.
    """
    #url = "http://www.ncbi.nlm.nih.gov/projects/SNP/snp_ref.cgi?rs=9606708"
    print 'attempting to query', snp
    snpNumber = snp[2:]  # strip the leading "rs"
    url = "http://www.ncbi.nlm.nih.gov/projects/SNP/snp_ref.cgi?" + "rs=" + snpNumber
    print 'trying url: ' + url
    mech = Browser()
    page = mech.open(url)
    return page, snp
def submit(timestamp, username, t_id, t_short, files, base_url):
    """Execute the request for a submission.

    timestamp (int): seconds from the start.
    username (string): username issuing the submission.
    t_id (string): id of the task.
    t_short (string): short name of the task.
    files ([string]): list of filenames of submitted files.
    base_url (string): http address of CWS.

    """
    print("\n%s - Submitting for %s on task %s" %
          (to_time(timestamp), username, t_short), end='')

    browser = Browser()
    browser.set_handle_robots(False)
    LoginRequest(browser, username, "", base_url=base_url).execute()
    SubmitRequest(browser, (int(t_id), t_short),
                  filename=files[0], base_url=base_url).execute()
def __init__(self, url, filename, sample_time, n_attempts=2):
    self.filename = filename
    self.sample_time = sample_time
    self.n_attempts = n_attempts
    sleep_time = 5
    self.url = url
    self.br = Browser()
    while True:
        attempt = 0
        for attempt in range(n_attempts):
            try:
                self._load_page()
                self.write_games_odds()
                break
            except:
                print('Error!')
                time.sleep(sleep_time)
        time.sleep(self.sample_time - attempt * sleep_time)
def yapistir():
    br = Browser()
    br.addheaders = [(
        'User-agent',
        'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:24.0) Gecko/20100101 Firefox/24.0'
    )]
    br.set_handle_robots(False)
    br.open("http://paste.ubuntu.com")
    br.select_form("pasteform")
    br['poster'] = coder
    br.find_control(name="syntax").value = ["python"]
    dosya_ac = open(dosya)   # open the file to be pasted
    kodlar = dosya_ac.read()  # read its contents
    br['content'] = kodlar
    br.submit()
    for link in br.links():
        k_link.append(link.url)
def read_all_result_page_links_for(mainurl):
    br = Browser()
    br.set_handle_robots(False)
    br.addheaders = [(
        'User-agent',
        'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1'
    )]
    i = 0
    global_list = []
    br.open(mainurl)
    nice_links = [l for l in br.links() if 'company' in l.url]
    #global_list.extend(nice_links)
    #record = {}
    for link in nice_links:
        i = i + 1
        read_detail_page(link.url, i)
def fetch():
    result_no = 0                   # Number the output files
    br = Browser()                  # Create a browser
    br.open(LOGIN_URL)              # Open the login page
    br.select_form(name="login")    # Find the login form
    br['username'] = USERNAME       # Set the form values
    br['password'] = PASSWORD
    resp = br.submit()              # Submit the form

    # Automatic redirect sometimes fails, follow manually when needed
    if 'Redirecting' in br.title():
        resp = br.follow_link(text_regex='click here')

    # Loop through the searches, keeping fixed query parameters
    for actor in VARIABLE_QUERY:
        # I like to watch what's happening in the console
        print >> sys.stderr, '***', actor

        # Lets do the actual query now
        br.open(SEARCH_URL + FIXED_QUERY + actor)

        # The query actually gives us links to the content pages we like,
        # but there are some other links on the page that we ignore
        nice_links = [l for l in br.links()
                      if 'good_path' in l.url and 'credential' in l.url]

        if not nice_links:  # Maybe the relevant results are empty
            break

        for link in nice_links:
            try:
                response = br.follow_link(link)

                # More console reporting on title of followed link page
                print >> sys.stderr, br.title()

                # Increment output filenames, open and write the file
                result_no += 1
                out = open('result_%04d' % result_no, 'w')
                print >> out, response.read()
                out.close()

            # Nothing ever goes perfectly, ignore if we do not get page
            except mechanize._response.httperror_seek_wrapper:
                print >> sys.stderr, "Response error (probably 404)"

            # Let's not hammer the site too much between fetches
            time.sleep(1)
def downloadAll(self, section: str, url: str, createdirs: bool,
                overwrite: int = 1, pattern: str = "", saveto: str = "",
                httpUsername: str = None, httpPassword: str = None):
    br = Browser()
    self.setupBrowser(br, url, httpUsername, httpPassword)
    br.open(url)
    for link in br.links(url_regex=pattern):
        if link.url.startswith("http://") or link.url.startswith("https://"):
            # Absolute URL: download it as-is
            self.download(section, link.url, createdirs, overwrite,
                          saveto=saveto, httpUsername=httpUsername,
                          httpPassword=httpPassword)
        elif link.url.startswith("/"):
            # Site-root-relative URL: prepend scheme and host from the base URL
            self.download(section,
                          link.base_url[:link.base_url.find("/", 8)] + link.url,
                          createdirs, overwrite, saveto=saveto,
                          httpUsername=httpUsername, httpPassword=httpPassword)
        else:
            # Page-relative URL: prepend the base URL's directory
            self.download(section,
                          link.base_url[:link.base_url.rfind("/") + 1] + link.url,
                          createdirs, overwrite, saveto=saveto,
                          httpUsername=httpUsername, httpPassword=httpPassword)