def retrieve_product_data(self, product_link):
    cookies = mechanize.CookieJar()
    opener = mechanize.build_opener(mechanize.HTTPCookieProcessor(cookies))
    opener.addheaders = [('User-agent', 'Mozilla/5.0 (MyProgram/0.1)'),
                         ('From', '*****@*****.**')]
    mechanize.install_opener(opener)
    browser = mechanize.Browser()
    product_data = browser.open(product_link).get_data()
    soup = BeautifulSoup(product_data)
    product_name = soup.find('title').string.encode('ascii', 'ignore')
    product_prices = soup.find('div', 'price').contents
    try:
        cash_price = int(clean_price_string(product_prices[4]))
        product_data = ProductData()
        product_data.custom_name = product_name
        product_data.price = cash_price
        product_data.url = product_link
        product_data.comparison_field = product_link
        return product_data
    except IndexError:
        return None
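# Note on the snippet above: mechanize.install_opener() only affects module-level
# calls such as mechanize.urlopen(); a mechanize.Browser keeps its own handler
# chain, so the headers installed on the opener never reach browser.open().
# A minimal sketch (header values are placeholders) that attaches the cookie jar
# and headers to the Browser itself instead:
def retrieve_product_data_via_browser(product_link):
    cookies = mechanize.CookieJar()
    browser = mechanize.Browser()
    browser.set_cookiejar(cookies)  # the Browser manages these cookies directly
    browser.addheaders = [('User-agent', 'Mozilla/5.0 (MyProgram/0.1)')]
    return browser.open(product_link).get_data()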
def readUrl(inUrl):
    response = None
    tryCount = 0
    while tryCount < 5:
        cookies = mechanize.CookieJar()
        opener = mechanize.build_opener(mechanize.HTTPCookieProcessor(cookies))
        opener.addheaders = [("User-agent", "Mozilla/5.0 (compatible; MyProgram/0.1)"),
                             ("From", "*****@*****.**")]
        mechanize.install_opener(opener)
        try:
            response = mechanize.urlopen(inUrl)
            break
        except:
            tryCount += 1
            print "******** Error on urlopen ***********"
            print "URL: ", inUrl
            print "Trying Again....", tryCount
    if response is None:
        # All five attempts failed; fail loudly instead of hitting a NameError below
        raise IOError("Could not open %s after 5 attempts" % inUrl)
    html = response.read()
    root = lxml.html.fromstring(html)
    return root
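# A hedged usage sketch for readUrl(): the return value is an lxml.html element,
# so it can be queried with XPath (the URL below is a placeholder):
root = readUrl('http://example.com/')
page_title = root.xpath('//title/text()')
links = [a.get('href') for a in root.xpath('//a')]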
def retrieve_product_links(self):
    cookies = mechanize.CookieJar()
    opener = mechanize.build_opener(mechanize.HTTPCookieProcessor(cookies))
    opener.addheaders = [('User-agent', 'Mozilla/5.0 (MyProgram/0.1)'),
                         ('From', '*****@*****.**')]
    mechanize.install_opener(opener)
    url_base = 'http://www.globalmac.cl/'
    browser = mechanize.Browser()
    url_extensions = [
        ['Distribuidor-Apple-Chile/MacBook-Air', 'Notebook'],
        ['Distribuidor-Apple-Chile/MacBook-Pro', 'Notebook'],
        ['Hardware-Mac-PC/Discos-Duros-Notebook-SATA-2.5', 'StorageDrive'],
        ['Hardware-Mac-PC/Discos-Duros-SATA-3.5', 'StorageDrive'],
        ['Hardware-Mac-PC/Discos-Duros-SSD-SATA-2.5', 'StorageDrive'],
    ]
    product_links = []
    for url_extension, ptype in url_extensions:
        url = url_base + url_extension
        base_data = browser.open(url).get_data()
        soup = BeautifulSoup(base_data)
        for item in soup.findAll('div', 'name'):
            product_links.append([item.find('a')['href'], ptype])
    return product_links
def slurp_with_login_and_pwd():
    import sys
    import mechanize
    # sys.path.append('ClientCookie-1.0.3')
    # from mechanize import ClientCookie
    # sys.path.append('ClientForm-0.1.17')
    # import ClientForm

    # Create a special URL opener (for the User-Agent header) and a CookieJar
    cookieJar = mechanize.CookieJar()
    opener = mechanize.build_opener(mechanize.HTTPCookieProcessor(cookieJar))
    opener.addheaders = [("User-agent", "Mozilla/5.0 (compatible)")]
    mechanize.install_opener(opener)

    fp = mechanize.urlopen("http://login.yahoo.com")
    forms = mechanize.ParseResponse(fp)
    fp.close()

    # print the forms on this page
    for form in forms:
        print "***************************"
        print form

    form = forms[0]
    form["login"] = "******"     # use your userid
    form["passwd"] = "password"  # use your password
    fp = mechanize.urlopen(form.click())
    fp.close()

    fp = mechanize.urlopen(
        "https://class.coursera.org/ml-003/lecture/download.mp4?lecture_id=1"
    )  # use your group
    fp.readlines()
    fp.close()
def themain():
    #browser = mechanize.Browser()
    #browser.open('http://www.baidu.com')
    cj = mechanize.LWPCookieJar()
    opener = mechanize.build_opener(mechanize.HTTPCookieProcessor(cj))
    mechanize.install_opener(opener)
    r = mechanize.urlopen('http://www.baidu.com')
    cj.save('cookie.txt', ignore_discard=True, ignore_expires=True)
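# The counterpart to cj.save() above: reload the jar on the next run so the
# session survives restarts. A minimal sketch, assuming cookie.txt was written
# by a previous call to themain():
def themain_resume():
    cj = mechanize.LWPCookieJar()
    cj.load('cookie.txt', ignore_discard=True, ignore_expires=True)
    opener = mechanize.build_opener(mechanize.HTTPCookieProcessor(cj))
    mechanize.install_opener(opener)
    r = mechanize.urlopen('http://www.baidu.com')  # request now carries the saved cookies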
def resolve(self, url, cookie_jar, user_agent):
    headers = {'User-agent': user_agent, 'Referer': url}
    try:
        cookie_jar.load(ignore_discard=True)
    except Exception as e:
        logger.info(e)
    opener = mechanize.build_opener(mechanize.HTTPCookieProcessor(cookie_jar))
    request = mechanize.Request(url)
    for key in headers:
        request.add_header(key, headers[key])
    try:
        response = opener.open(request)
    except mechanize.HTTPError as e:
        response = e
    body = response.read()
    cookie_jar.extract_cookies(response, request)
    cookie_helper.check_cookies(cookie_jar)
    parsed_url = urlparse(url)
    submit_url = "%s://%s/cdn-cgi/l/chk_jschl" % (parsed_url.scheme,
                                                  parsed_url.netloc)
    params = {}
    try:
        params["jschl_vc"] = re.search(r'name="jschl_vc" value="(\w+)"',
                                       body).group(1)
        params["pass"] = re.search(r'name="pass" value="(.+?)"',
                                   body).group(1)
        js = self._extract_js(body)
    except (AttributeError, mechanize.HTTPError):
        # re.search() returns None (-> AttributeError) when the challenge markup is absent
        return None
    params["jschl_answer"] = str(js + len(parsed_url.netloc))
    sParameters = urllib.urlencode(params, True)
    request = mechanize.Request("%s?%s" % (submit_url, sParameters))
    for key in headers:
        request.add_header(key, headers[key])
    sleep(5)
    try:
        response = opener.open(request)
    except mechanize.HTTPError as e:
        response = e
    return response
def __callRequest(self):
    if self.caching and self.cacheTime > 0:
        sContent = self.readCache(self.getRequestUri())
        if sContent:
            return sContent
    cookieJar = mechanize.LWPCookieJar(filename=self._cookiePath)
    try:  # TODO: maybe drop the try
        cookieJar.load(ignore_discard=self.__bIgnoreDiscard,
                       ignore_expires=self.__bIgnoreExpired)
    except Exception as e:
        logger.info(e)
    sParameters = urllib.urlencode(self.__aParameters, True)
    handlers = [
        mechanize.HTTPCookieProcessor(cookiejar=cookieJar),
        mechanize.HTTPEquivProcessor,
        mechanize.HTTPRefreshProcessor
    ]
    if sys.version_info >= (2, 7, 9) and sys.version_info < (2, 7, 11):
        handlers.append(newHTTPSHandler)
    opener = mechanize.build_opener(*handlers)
    if len(sParameters) > 0:
        oRequest = mechanize.Request(self.__sUrl, sParameters)
    else:
        oRequest = mechanize.Request(self.__sUrl)
    for key, value in self.__headerEntries.items():
        oRequest.add_header(key, value)
    cookieJar.add_cookie_header(oRequest)
    user_agent = self.__headerEntries.get('User-Agent', common.FF_USER_AGENT)
    try:
        oResponse = opener.open(oRequest, timeout=self.requestTimeout)
    except mechanize.HTTPError as e:
        # The original condition "== 'cloudflare-nginx' or 'cloudflare'" was
        # always true; the intent is to match either server name.
        if e.code == 503 and e.headers.get("Server") in ('cloudflare-nginx',
                                                         'cloudflare'):
            html = e.read()
            oResponse = self.__check_protection(html, user_agent, cookieJar)
            if not oResponse:
                logger.error("Failed to get CF-Cookie for Url: " + self.__sUrl)
                return ''
        elif not self.ignoreErrors:
            xbmcgui.Dialog().ok('xStream', 'Fehler beim Abrufen der Url:',
                                self.__sUrl, str(e))
            logger.error("HTTPError " + str(e) + " Url: " + self.__sUrl)
            return ''
        else:
            oResponse = e
def initialize_browser():
    """Settings that work around cookies, robots.txt and the rest, to pass
    as a normal browser."""
    cookiejar = cookielib.LWPCookieJar()
    opener = mechanize.build_opener(mechanize.HTTPCookieProcessor(cookiejar))
    mechanize.install_opener(opener)
    browser = mechanize.Browser()
    browser.set_handle_robots(False)
    browser.set_handle_redirect(True)
    browser.set_cookiejar(cookiejar)
    browser.set_handle_equiv(True)
    browser.set_handle_referer(True)
    browser.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(),
                               max_time=2)
    browser.addheaders = [('User-agent', 'Google Chrome')]
    return browser, cookiejar
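# A short usage sketch for initialize_browser() (the URL is a placeholder);
# the returned jar stays available for inspection or saving:
browser, cookiejar = initialize_browser()
response = browser.open('http://example.com/')
html = response.read()
for cookie in cookiejar:
    print cookie.name, cookie.value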
def __init__(self):
    self.cj = mechanize.LWPCookieJar()
    opener = mechanize.build_opener(mechanize.HTTPCookieProcessor(self.cj))
    mechanize.install_opener(opener)
    self.br = mechanize.Browser()
    self.br.set_cookiejar(self.cj)
    self.sessionkey = 'None'
    self.br.set_header(
        'User-Agent',
        value='Mozilla/5.0 (X11; Linux x86_64; rv:73.0) Gecko/20100101 Firefox/73.0')
    # self.br.set_debug_http(True)
    self.br.set_debug_redirects(True)
def get_trash_zone(address, zip): #Make cookie jar. See wwwsearch.sourceforge.dat/mechanize/hints.html cj = mechanize.LWPCookieJar() opener = mechanize.build_opener(mechanize.HTTPCookieProcessor(cj)) mechanize.install_opener(opener) #Save cookies cj.save( "/usr/local/django/recyclocity/recyclocity_static/cookies/cookie_jar", ignore_discard=True, ignore_expires=True) #Create a browser browser = mechanize.Browser() #Fill in form browser.open('http://lmt-web.lowermerion.org/cgi-bin/refuse2.plx') browser.form = list(browser.forms())[0] browser.form['askrecycle'] = address browser.form['postcode'] = zip #Submit form browser.submit() #Extract content content = browser.response().read() #Use pattern match to extract fields m = re.search('<b>(Monday|Tuesday|Wednesday|Thursday|Friday)</b>', content) if m: day, = m.groups() #Convert day to number day_number = schedule_helpers.get_day_number(day) else: #Failed return m = re.search('<b>Zone ([1-4])</b>', content) if m: zone, = m.groups() else: #Failed return #Match for both day and zone return day_number, zone
def get_trash_zone(address, zip):
    # Make a cookie jar. See wwwsearch.sourceforge.net/mechanize/hints.html
    cj = mechanize.LWPCookieJar()
    opener = mechanize.build_opener(mechanize.HTTPCookieProcessor(cj))
    mechanize.install_opener(opener)

    # Create a browser
    browser = mechanize.Browser()

    # User-Agent (this is cheating, ok?)
    browser.addheaders = [(
        'User-agent',
        'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1'
    )]

    # Save cookies
    cj.save("/usr/local/django/recyclocity/recyclocity_static/cookies/cookie_jar",
            ignore_discard=True, ignore_expires=True)

    # Fill in the form
    #browser.open('http://citymaps.phila.gov/portal/')
    #browser.select_form(name="form1")
    #browser.form['txtSearchAddress'] = address
    #browser.open('https://alpha.phila.gov/property/')
    #browser.open('http://www.lowermerion.org/cgi-bin/recycle2.plx/')
    browser.open(
        'http://www.lowermerion.org/services/public-works-department/refuse-and-recycling/how-to-determine-your-recycling-collection-day'
    )
    #browser.form = list(browser.forms())[0]
    #browser.form['askrecycle'] = address
    #browser.form['postcode'] = zip

    # Submit the form
    #browser.submit()

    # Extract the content
    content = browser.response().read()
    return content
def __init__(self, username, password):
    mechanize.Browser.__init__(self)
    cj = mechanize.LWPCookieJar()
    self.set_cookiejar(cj)
    self.set_handle_equiv(True)
    self.set_handle_redirect(True)
    self.set_handle_referer(True)
    self.set_handle_robots(False)
    self.addheaders = [(
        'User-agent',
        'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1'
    )]
    self.open(self.base_url)
    self.username = username
    self.password = password
    self.login()
    opener = mechanize.build_opener(mechanize.HTTPCookieProcessor(cj))
    mechanize.install_opener(opener)
def go():
    '''
    Main procedure of the scraper. Creates a browser, loads the list of
    tasks, and executes them.
    '''
    try:
        # Prepare the browser
        cookies = mechanize.CookieJar()
        opener = mechanize.build_opener(mechanize.HTTPCookieProcessor(cookies))
        mechanize.install_opener(opener)
        br = mechanize.Browser()
        br.set_handle_robots(False)
        br.set_handle_refresh(False)
        br.set_handle_referer(False)
        br.open("http://www.infogreffe.fr/infogreffe/process.do")

        # Get the list of tasks
        tasks = load_task_queue()
        if len(tasks) == 0:
            # If there is no task to execute, init/reset the table
            init_task_queue()
            tasks = load_task_queue()
        for task in tasks:
            try:
                # Execute the task
                results = get_companies(br, task['name'], task['dept'])
                # If we hit the soft limit, add more refined searches to the queue
                if results == 100:
                    print "Limit reached for %s in %s, adding new tasks" % (
                        task['name'], task['dept'])
                    expand_task_queue(task['name'], task['dept'])
                # Mark the task as done
                mark_task_done(task['name'], task['dept'], results)
            except Exception as detail:
                # We may get an exception for using too much CPU time.
                print "Exception raised", detail
    except Exception as detail:
        # If we can't open the browser, just skip running the scraper
        print "Failed starting browser ", detail
def resolve(self, req, error, cookieJar):
    sleep(5)
    useragent = req.headers.get('User-agent')
    body = error.read()
    parsed_url = urlparse(error.url)
    submit_url = "%s://%s/cdn-cgi/l/chk_jschl" % (parsed_url.scheme,
                                                  parsed_url.netloc)
    params = {}
    try:
        params["jschl_vc"] = re.search(r'name="jschl_vc" value="(\w+)"',
                                       body).group(1)
        params["pass"] = re.search(r'name="pass" value="(.+?)"',
                                   body).group(1)
        js = self._extract_js(body)
    except:
        raise
    params["jschl_answer"] = str(js + len(parsed_url.netloc))
    opener = mechanize.build_opener(mechanize.HTTPCookieProcessor(cookieJar))
    sParameters = urllib.urlencode(params, True)
    request = mechanize.Request("%s?%s" % (submit_url, sParameters))
    request.add_header('Referer', error.url)
    request.add_header('User-agent', useragent)
    try:
        response = opener.open(request)
    except:
        raise
    return response, cookieJar
def logIn(self):
    """
    Logs in to private archives using the supplied email and password.
    Stores the cookie so we can continue to get subsequent pages.
    """
    cookieJar = mechanize.CookieJar()
    opener = mechanize.build_opener(mechanize.HTTPCookieProcessor(cookieJar))
    opener.addheaders = [("User-agent", "Mozilla/5.0 (compatible)")]
    mechanize.install_opener(opener)

    self.message('Logging in to ' + self.list_url)
    fp = mechanize.urlopen(self.list_url)
    forms = ClientForm.ParseResponse(fp, backwards_compat=False)
    fp.close()

    form = forms[0]
    form['username'] = self.username
    form['password'] = self.password
    fp = mechanize.urlopen(form.click())
    fp.close()
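# Note on the snippet above: ClientForm was folded into mechanize itself as of
# mechanize 0.2.0, so when the standalone ClientForm module is unavailable the
# same parse works through mechanize directly (list_url stands in for
# self.list_url here):
fp = mechanize.urlopen(list_url)
forms = mechanize.ParseResponse(fp, backwards_compat=False)
fp.close()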
def BROWSER(self, cookie3=''):
    br = mechanize.Browser()

    # Cookie jar: create a fresh randomly named cookie file per site,
    # or revert to an existing one if a path was supplied.
    if self.cookie3 == '':
        fo = os.getcwd().replace('\\', '/')
        site = urlparse2(self.url).hostname
        if not os.path.isdir(fo + "/cookies/" + site):
            os.mkdir(fo + "/cookies/" + site)
        chars = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"
        self.cookie3 = (fo + "/cookies/" + site + '/' +
                        ''.join([random.choice(chars) for x in range(5)]) + ".txt")
        self.cj = cookielib.LWPCookieJar()
    else:
        self.cj = cookielib.LWPCookieJar()
        self.cj.revert(self.cookie3)
    opener = mechanize.build_opener(mechanize.HTTPCookieProcessor(self.cj))
    br.set_cookiejar(self.cj)

    # Browser options
    br.set_handle_equiv(True)
    br.set_handle_gzip(True)
    br.set_handle_referer(True)
    br.set_handle_robots(False)   # ignore robots.txt
    br.set_handle_refresh(True)   # can sometimes hang without this
    br.set_handle_redirect(True)
    # Follows refresh 0 but does not hang on refresh > 0
    br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)

    # Want debugging messages?
    #br.set_debug_http(True)
    #br.set_debug_redirects(True)
    #br.set_debug_responses(True)

    # User-Agent (this is cheating, ok?)
    br.addheaders = [('User-Agent', 'Mozilla/5.0 (Linux; U; Android 2.3.4; en-us; T-Mobile myTouch 3G Slide Build/GRI40) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1'),
                     ('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'),
                     ('Accept-Language', 'en-gb,en;q=0.5'),
                     ('Accept-Encoding', 'gzip,deflate'),
                     ('Accept-Charset', 'ISO-8859-1,utf-8;q=0.7,*;q=0.7'),
                     ('Keep-Alive', '115'),
                     ('Connection', 'keep-alive'),
                     ('Cache-Control', 'max-age=0'),
                     ('Referer', 'http://yahoo.com')]

    # Proxy and proxy user/password (set_proxies returns None, so don't assign it)
    if self.proxy != [] and self.proxy != '' and not re.findall('None', self.proxy):
        br.set_proxies({"http": self.proxy})
    if self.User_Pass != [] and self.User_Pass != '' and not re.findall('None:None', self.User_Pass):
        br.add_proxy_password(self.User_Pass.split(":")[0],
                              self.User_Pass.split(":")[1])

    self.cj.save(self.cookie3)
    return br
def BROWSER(self):
    br = mechanize.Browser()

    # Cookie jar: a fresh randomly named cookie file
    chars = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"
    cookie3 = ''.join([random.choice(chars) for x in range(5)]) + ".txt"
    cj = cookielib.LWPCookieJar()
    # cj.revert(cookie3)
    opener = mechanize.build_opener(mechanize.HTTPCookieProcessor(cj))
    br.set_cookiejar(cj)
    fo = os.getcwd()
    cj.save(fo + "\\cookies\\" + cookie3)

    # Browser options
    br.set_handle_equiv(True)
    br.set_handle_gzip(True)
    br.set_handle_redirect(True)
    br.set_handle_referer(True)
    br.set_handle_robots(False)
    # Follows refresh 0 but does not hang on refresh > 0
    br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)

    # Want debugging messages?
    #br.set_debug_http(True)
    #br.set_debug_redirects(True)
    #br.set_debug_responses(True)

    # User-Agent (this is cheating, ok?)
    br.addheaders = [(
        'User-agent',
        'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1'
    )]

    # Proxy and proxy user/password
    if self.proxy != [] and self.proxy != '' and not re.findall('None', self.proxy):
        br.set_proxies({"http": self.proxy})
    if self.User_Pass != [] and self.User_Pass != '' and not re.findall('None:None', self.User_Pass):
        br.add_proxy_password(self.User_Pass.split(":")[0],
                              self.User_Pass.split(":")[1])
    return br
    except:
        oResponse.set_data(gzipper.extrabuf)
    if self.__aResponses:
        forms = mechanize.ParseResponse(oResponse, backwards_compat=False)
        form = forms[self.__formIndex]
        for field in self.__aResponses:
            #logger.info("Field: " + str(not field in form))
            try:
                form.find_control(name=field)
            except:
                form.new_control("text", field, {"value": ""})
                form.fixup()
            form[field] = self.__aResponses[field]
        o = mechanize.build_opener(mechanize.HTTPCookieProcessor(cookieJar))
        oResponse = o.open(form.click(), timeout=self.requestTimeout)
    sContent = oResponse.read()
    checked_response = self.__check_protection(sContent, user_agent, cookieJar)
    if checked_response:
        oResponse = checked_response
        sContent = oResponse.read()
    cookie_helper.check_cookies(cookieJar)
    cookieJar.save(ignore_discard=self.__bIgnoreDiscard,
                   ignore_expires=self.__bIgnoreExpired)
    if self.__bRemoveNewLines == True:
def __init__(self, ip):
    self.ip = ip
    self.neighbours = {}
    self.laser_ports = {}
    self.new_adm = False
    self.baseurl = 'http://%s:20080/' % (self.ip)
    try:
        br = mechanize.Browser()  # Create a mechanize browser object
        # Add fake headers
        try:
            cookies = mechanize.CookieJar()
            opener = mechanize.build_opener(
                mechanize.HTTPCookieProcessor(cookies))
            opener.addheaders = [(
                "User-agent",
                "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"
            )]
            mechanize.install_opener(opener)
        except Exception as e:
            print(str(e))
        try:
            if 'TJ1400' in br.open(self.baseurl, timeout=5.0).read():
                self.new_adm = True
                br.form = list(br.forms())[0]
                controls = list(br.form.controls)
                controls[0].value = 'tejas'
                controls[1].value = 'j72e#05t'
                page = br.submit()
                self.new_adm = True
                time.sleep(5)
                page = br.open(self.baseurl, timeout=5.0).read()
        except Exception as e:
            #print("{}-{}".format(str(e), self.ip))
            br = mechanize.Browser()
            # Get user id and password from command-line arguments
            br.add_password(self.baseurl, username, passw)
            # Check if the NE is accessible
            page = br.open(self.baseurl, timeout=5.0).read()
            self.new_adm = False
        if 'alarmBanner' in page:
            print "Logged in to %s" % (self.baseurl)
            loggedIn = self.get_laser_data(br)  # Read laser data of STM ports
            # Read alarms (MS DCC Fail only)
            failTime = threading.Thread(target=self.get_fail_time, args=(br, ))
            failTime.start()
            # Add neighbours
            addNeighbours = threading.Thread(target=self.add_neighbours,
                                             args=(br, ))
            addNeighbours.start()
            if loggedIn:
                self.backup(br)  # Backup cross-connect info
            failTime.join()
            addNeighbours.join()
            if self.alarams_dict:
                for stm in self.alarams_dict.keys():
                    if stm in self.neighbours.keys():
                        fail_node_times = [[
                            self.ip, self.neighbours[stm][0],
                            self.alarams_dict[stm]
                        ]]
                        fail_times.extend(fail_node_times)
    except Exception as e:
        print("\nError reading {} \n-+--+- {} -+--+-".format(ip, str(e)))
    br.close()
    return (None)
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor  # needed by the Rule definitions below
from scrapy.selector import HtmlXPathSelector
from scrapy.item import Item, Field
from scrapy.contrib.loader import XPathItemLoader
from scrapy.contrib.loader.processor import MapCompose, TakeFirst
from scrapy.conf import settings
from scrapy import log
from scrapy.xlib.pydispatch import dispatcher
from scrapy import signals
import mechanize

# Note: this opener is built but never installed or used; Scrapy's downloader
# does not go through mechanize.
cookies = mechanize.CookieJar()
opener = mechanize.build_opener(mechanize.HTTPCookieProcessor(cookies))
opener.addheaders = [("User-agent",
                      'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_4) AppleWebKit/534.56.5 (KHTML, like Gecko) Version/5.1.6 Safari/534.56.5'),
                     ("From", "*****@*****.**")]


class MoviefoneSpider(CrawlSpider):
    name = 'moviefone'
    allowed_domains = ['www.moviefone.com']
    start_urls = ['http://www.moviefone.com/dvd/']
    rules = (
        ## Pitchfork - Top 50 - 2011
        Rule(SgmlLinkExtractor(allow=('/dvd/?page'))),
        Rule(SgmlLinkExtractor(restrict_xpaths=(
            '//div[@class="hub-body"]/div[43]/div[2]/div[1]/a')),
            follow=True),
        Rule(SgmlLinkExtractor(restrict_xpaths=(
            '//div[@class="hub-body"]/div/div/a[@class="movieTitle"]')),
            callback='parseItem', follow=True),
    )
def login_to_site(url, form_data, proxy=[], User_Pass=[]):
    username = "******" % form_data
    password = "******" % form_data
    user_tag = "%(user_tag)s" % form_data
    pass_tag = "%(pass_tag)s" % form_data
    Form_id = "%(Form_id)s" % form_data
    log_done = "%(Log_test)s" % form_data

    br = mechanize.Browser(factory=mechanize.RobustFactory())

    # Browser options
    br.set_handle_robots(False)
    br.set_handle_referer(True)
    br.set_handle_refresh(True)
    br.set_handle_equiv(True)
    br.set_handle_gzip(True)
    br.set_handle_redirect(True)

    chars = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"
    cookie3 = ''.join([random.choice(chars) for x in range(5)]) + ".txt"
    cj = cookielib.LWPCookieJar()
    # cj.revert(cookie3)
    opener = mechanize.build_opener(mechanize.HTTPCookieProcessor(cj))
    br.set_cookiejar(cj)
    try:
        fo = os.getcwd()
        os.chdir(fo)
        os.mkdir(fo + "\\cookies\\")
    except:
        pass
    pathname = os.path.join("cookies", cookie3)
    cj.save(pathname)

    # Follows refresh 0 but does not hang on refresh > 0
    br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)

    # User-Agent (this is cheating, ok?)
    br.addheaders = [(
        'User-agent',
        'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1'
    )]

    if proxy != [] and not re.findall('None:None', proxy):
        br.set_proxies({"http": proxy})
    if User_Pass != [] and not re.findall('None:None', User_Pass):
        br.add_proxy_password(User_Pass.split(":")[0], User_Pass.split(":")[1])

    try:
        br.open(url)
    except urllib2.HTTPError as e:
        print "Got error code", e.code
    # Retry once on an HTTP error
    try:
        br.open(url)
    except urllib2.HTTPError as e:
        print "Got error code", e.code
def resolve(self, url, cookie_jar, user_agent):
    headers = {'User-agent': user_agent, 'Referer': url}
    try:
        cookie_jar.load(ignore_discard=True)
    except Exception as e:
        logger.info(e)
    opener = mechanize.build_opener(mechanize.HTTPCookieProcessor(cookie_jar))

    request = mechanize.Request(url)
    for key in headers:
        request.add_header(key, headers[key])
    try:
        response = opener.open(request)
    except mechanize.HTTPError as e:
        response = e
    body = response.read()
    cookie_jar.extract_cookies(response, request)
    cookie_helper.check_cookies(cookie_jar)

    pattern = r'xhr\.open\("GET","([^,]+),'
    match = cParser.parse(body, pattern)
    if not match[0]:
        return
    urlParts = match[1][0].split('"')
    parsed_url = urlparse(url)
    sid = '1200'
    script_url = '%s://%s%s%s%s' % (parsed_url.scheme, parsed_url.netloc,
                                    urlParts[0], sid, urlParts[2])

    request = mechanize.Request(script_url)
    for key in headers:
        request.add_header(key, headers[key])
    try:
        response = opener.open(request)
    except mechanize.HTTPError as e:
        response = e
    body = response.read()
    cookie_jar.extract_cookies(response, request)
    cookie_helper.check_cookies(cookie_jar)

    if not self.checkBFCookie(body):
        return body  # even if it's False, it's probably not the right content; we'll see
    cookie = self.getCookieString(body)
    if not cookie:
        return
    name, value = cookie.split(';')[0].split('=')
    cookieData = dict(
        (k.strip(), v.strip())
        for k, v in (item.split("=") for item in cookie.split(";")))
    cookie = cookie_helper.create_cookie(name, value,
                                         domain=cookieData['domain'],
                                         expires=sys.maxint,
                                         discard=False)
    cookie_jar.set_cookie(cookie)

    request = mechanize.Request(url)
    for key in headers:
        request.add_header(key, headers[key])
    try:
        response = opener.open(request)
    except mechanize.HTTPError as e:
        response = e
    return response
def login(self, className):
    """
    Log in to coursera and obtain the necessary session cookies.
    """
    hn, fn = tempfile.mkstemp()
    cookies = cookielib.LWPCookieJar()
    handlers = [
        urllib2.HTTPHandler(),
        urllib2.HTTPSHandler(),
        urllib2.HTTPCookieProcessor(cookies)
    ]
    # prepend a proxy handler if defined
    if self.proxy:
        proxy = urllib2.ProxyHandler({'http': self.proxy})
        handlers = [proxy] + handlers
    opener = urllib2.build_opener(*handlers)

    url = self.lecture_url_from_name(className)
    req = urllib2.Request(url)
    try:
        res = opener.open(req)
    except urllib2.HTTPError as e:
        if e.code == 404:
            raise Exception("Unknown class %s" % className)

    # get the csrf token
    csrfcookie = [c for c in cookies if c.name == "csrf_token"]
    if not csrfcookie:
        raise Exception("Failed to find csrf cookie")
    csrftoken = csrfcookie[0].value
    opener.close()

    # call the authenticator url:
    cj = cookielib.MozillaCookieJar(fn)
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj),
                                  urllib2.HTTPHandler(),
                                  urllib2.HTTPSHandler())
    opener.addheaders.append(('Cookie', 'csrftoken=%s' % csrftoken))
    opener.addheaders.append(('Referer', 'https://accounts.coursera.org/signin'))
    opener.addheaders.append(('X-CSRFToken', csrftoken))
    req = urllib2.Request(self.LOGIN_URL)
    data = urllib.urlencode({'email': self.username, 'password': self.password})
    req.add_data(data)
    try:
        opener.open(req)
    except urllib2.HTTPError as e:
        if e.code == 401:
            raise Exception("Invalid username or password")

    # check if we managed to login
    sessionid = [c.name for c in cj if c.name == "CAUTH"]
    if not sessionid:
        raise Exception("Failed to authenticate as %s" % self.username)

    # all should be ok now; mechanize can handle the rest if we give it the cookies
    br = mechanize.Browser()
    #br.set_debug_http(True)
    #br.set_debug_responses(False)
    #br.set_debug_redirects(True)
    br.set_handle_robots(False)
    br.set_cookiejar(cj)
    if self.proxy:
        br.set_proxies({"http": self.proxy})
    self.browser = br

    # also use this cookiejar for other mechanize operations (e.g., urlopen)
    opener = mechanize.build_opener(mechanize.HTTPCookieProcessor(cj))
    mechanize.install_opener(opener)
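# Because the final install_opener() call above shares the authenticated jar,
# plain mechanize.urlopen() requests made after login() ride on the CAUTH
# session cookie. A hedged sketch; "downloader" and "lecture_url" are
# hypothetical names standing in for an instance of this class and a course URL:
downloader.login('ml-003')
html = mechanize.urlopen(lecture_url).read()  # sent with the CAUTH cookie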
def getNewToken(self):
    import mechanize  #@UnresolvedImport
    br = mechanize.Browser()
    __addon__ = xbmcaddon.Addon(id='script.facebook.media')
    cookiesPath = os.path.join(
        xbmc.translatePath(__addon__.getAddonInfo('profile')),
        'cache', 'cookies')
    LOG('Cookies will be saved to: ' + cookiesPath)
    cookies = mechanize.LWPCookieJar(cookiesPath)
    if os.path.exists(cookiesPath):
        cookies.load()
    self.cookieJar = cookies
    opener = mechanize.build_opener(mechanize.HTTPCookieProcessor(cookies))
    mechanize.install_opener(opener)
    br.set_cookiejar(self.cookieJar)
    br._ua_handlers["_cookies"].cookiejar.clear()
    br.set_handle_robots(False)
    agent = 'XBMC/{0} Facebook-Media/{1}'.format(
        xbmc.getInfoLabel('System.BuildVersion'), self.version)
    LOG('Setting User Agent: {0}'.format(agent))
    br.addheaders = [('User-agent', agent)]
    scope = ''
    if self.scope:
        scope = '&scope=' + self.scope
    url = 'https://www.facebook.com/dialog/oauth?client_id=' + self.client_id + \
          '&redirect_uri=' + self.redirect + \
          '&type=user_agent&display=popup' + scope
    LOG(url)
    try:
        res = br.open(url)
        html = res.read()
    except:
        LOG("ERROR: TOKEN PAGE INITIAL READ")
        raise
    script = False
    try:
        # check for the login form
        br.select_form(nr=0)
        LOG("HTML")
    except:
        self.genericError()
        script = True
        LOG("SCRIPT")
    if script:
        # no form; maybe we're logged in and the token is in javascript on the page
        url = res.geturl()
        token = self.extractTokenFromURL(url)
        if not token:
            token = self.parseTokenFromScript(html)
    else:
        try:
            # fill out the form and submit
            br['email'] = self.login_email
            br['pass'] = self.login_pass
            res = br.submit()
            url = res.geturl()
            LOG("FORM")
        except:
            LOG("FORM ERROR")
            raise
        script = False
        token = self.extractTokenFromURL(url)
        html = self.browserRead(res, '-noscript')
        if not token:
            #if 'class="checkpoint"' in html:
            token = self.handleLoginNotificationCrap(br)
            if not token:
                script = True
        if script:
            LOG("SCRIPT TOKEN")
            # no token in the url; try to parse it from javascript on the page
            try:
                __addon__ = xbmcaddon.Addon(id='script.facebook.media')
                htmlFile = os.path.join(
                    xbmc.translatePath(__addon__.getAddonInfo('profile')),
                    'cache', 'DEBUG_HTML.html')
                open(htmlFile, 'w').write(html)
                LOG('html output written to: ' + htmlFile)
            except:
                pass
            token = self.parseTokenFromScript(html)
    token = urllib.unquote(token.decode('unicode-escape'))
    if not self.tokenIsValid(token):
        #if script: LOG("HTML:" + html)
        return False
    LOG("\n|--------------------\n|TOKEN: %s\n|--------------------" % token)
    self.saveToken(token)
    if self.cookieJar is not None:
        self.cookieJar.save()
    return token
def retrieve_product_links(self):
    cookies = mechanize.CookieJar()
    opener = mechanize.build_opener(mechanize.HTTPCookieProcessor(cookies))
    opener.addheaders = [('User-agent', 'Mozilla/5.0 (MyProgram/0.1)'),
                         ('From', '*****@*****.**')]
    mechanize.install_opener(opener)

    url_buscar_productos = '/cl/'
    product_links = []
    url_base = 'http://www.dell.com'

    # Start: home
    url_extensions = [
        'p/laptops?cat=laptops',
    ]
    for url_extension in url_extensions:
        url_webpage = url_base + url_buscar_productos + url_extension
        r = mechanize.urlopen(url_webpage)
        soup = BeautifulSoup(r.read())
        notebook_lines_container = soup.find('div', 'tabschegoryGroups')
        notebook_lines = notebook_lines_container.findAll('div', recursive=False)
        notebook_urls = []
        for line in notebook_lines:
            for container in line.findAll('div', 'prodImg'):
                link = container.find('a')['href'].replace('pd', 'fs')
                notebook_urls.append(url_base + link)
        for notebook_url in notebook_urls:
            for line_url in self.retrieve_line_links(notebook_url):
                product_links.append([line_url, 'Notebook'])

    # Start: business
    url_extensions = [
        'empresas/p/laptops',
    ]
    for url_extension in url_extensions:
        url_webpage = url_base + url_buscar_productos + url_extension
        r = mechanize.urlopen(url_webpage)
        soup = BeautifulSoup(r.read())
        product_containers = soup.findAll('div', 'carouselProduct')
        for container in product_containers:
            url = url_base + container.find('a')['href']
            for enterprise_url in self.retrieve_enteprise_links(url):
                product_links.append([enterprise_url, 'Notebook'])

    # Start: monitors
    url_extensions = [
        '/content/products/compare.aspx/19_22widescreen'
        '?c=cl&cs=cldhs1&l=es&s=dhs',
        '/content/products/compare.aspx/23_30widescreen'
        '?c=cl&cs=cldhs1&l=es&s=dhs',
        '/cl/es/empresas/Monitores/19_15widescreen/cp.aspx'
        '?refid=19_15widescreen&s=bsd&cs=clbsdt1',
        '/cl/es/empresas/Monitores/22_20widescreen/cp.aspx'
        '?refid=22_20widescreen&s=bsd&cs=clbsdt1',
        '/cl/es/empresas/Monitores/30_24widescreen/cp.aspx'
        '?refid=30_24widescreen&s=bsd&cs=clbsdt1',
        '/cl/es/empresas/Monitores/20_19flatpanel/cp.aspx'
        '?refid=20_19flatpanel&s=bsd&cs=clbsdt1',
    ]
    for url_extension in url_extensions:
        url_webpage = url_base + url_extension
        r = mechanize.urlopen(url_webpage)
        soup = BeautifulSoup(r.read())
        links = soup.findAll('a', {'class': 'lnk'})
        for link in links:
            if 'configure' in link['href']:
                product_links.append([link['href'], 'Screen'])
    return product_links