Ejemplo n.º 1
0
 def __open(self, url):
   """Fetch *url* and return its HTML source.

   Side effects: stores the body in ``self.html_src``, the elapsed request
   time (a timedelta) in ``self.req_time`` and, on failure, the exception
   in ``self.req_err`` (``self.html_src`` is then left unchanged).
   """
   # Browser-like headers so sites that reject the default urllib
   # User-Agent still respond.
   req = Request(url, headers = {
     'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
     'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
     'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
     'Accept-Encoding': 'none',
     'Accept-Language': 'en-US,en;q=0.8',
     'Connection': 'keep-alive'})

   start = dt.now()

   resp = None
   try:
     # BUG FIX: the original rebound ``req = None`` and called
     # ``urlopen(url)``, discarding the Request above so the custom
     # headers were never sent. Pass the prepared Request instead.
     resp = urlopen(req)
     self.html_src = resp.read()
   except Exception as e:
     self.req_err = e
   finally:
     # BUG FIX: the original closed the response only in the ``else``
     # branch and only when ``self.req_err`` was already set by a
     # *previous* call — i.e. never on a normal first success. Close it
     # whenever it was opened, and record the timing uniformly.
     if resp is not None:
       resp.close()
     self.req_time = dt.now() - start

   return self.html_src
Ejemplo n.º 2
0
   # NOTE(review): Python 2 print statement — this snippet only runs under
   # Python 2. COOKIE_FILE and the login code are defined above this view.
   print 'Logged in to FitDay. Saved cookie to '+COOKIE_FILE+'.'

# Download weight data
# Starting page for the scrape; the loop below keeps fetching until
# nexturl is set to '' (the update happens past this excerpt).
nexturl = 'https://www.fitday.com/fitness/WeightHistory.html'

# Follow weight-history pages until nexturl is cleared (cleared further
# down, past this excerpt).
while nexturl != '':
   # Load the cookie and visit the URL of interest.
   cookieJar.load(COOKIE_FILE)
   # NOTE(review): bare except + unconditional retry — a permanent failure
   # (bad URL, no network) loops forever with no backoff. Worth narrowing
   # to urllib2.URLError and capping retries.
   while True:
      try:
         req = urlopen(nexturl)
         break
      except:
         print "Error reading the URL. Trying again..."
   htmlSource = req.read()
   req.close()
   
   # Filter the result a bit.
   # NOTE(review): drops the first 11 characters — presumably a fixed
   # JSON/JS wrapper prefix in FitDay's response; confirm against a live
   # response before changing.
   htmlSource = htmlSource[11:]
   # Un-escape the backslash-escaped quotes/newlines/tabs so the payload
   # parses as plain HTML.
   htmlSource = htmlSource.replace('\\"','"')
   htmlSource = htmlSource.replace('\\n','\n')
   htmlSource = htmlSource.replace('\\t','\t')
   
   soup = BeautifulSoup(''.join(htmlSource))
   
   to_extract = soup.findAll('script') # removing JS
   for item in to_extract:
      item.extract()
   
   # First table inside the ListView div holds the weight entries
   # (consumed below, past this excerpt).
   weight_table = soup.find('div', {'class' : 'ListView'}).table