def iriToUri(self, iri):
    """Convert an IRI (UTF-8 byte string that may contain non-ASCII
    characters) into a plain-ASCII URI.

    The netloc component (index 1 of the parse result) is IDNA-encoded;
    every other component is UTF-8 encoded and then percent-escaped via
    ``self.urlEncodeNonAscii``.
    """
    import urlparse
    components = urlparse.urlparse(iri.decode('utf-8'))
    encoded = (
        component.encode('idna') if index == 1
        else self.urlEncodeNonAscii(component.encode('utf-8'))
        for index, component in enumerate(components)
    )
    return urlparse.urlunparse(encoded)
def add_params(url, params):
    """Return *url* with the mapping *params* merged into its query string.

    Existing query parameters are kept; a key that appears both in the
    URL and in *params* takes the value from *params* (and duplicate keys
    in the URL collapse to the last occurrence, as before).

    Works under both Python 2 and Python 3: the original py2-only
    ``urlparse``/``urllib`` imports are kept as a fallback.
    """
    try:  # Python 3
        from urllib.parse import urlparse, urlunparse, parse_qsl, urlencode
    except ImportError:  # Python 2
        from urlparse import urlparse, urlunparse, parse_qsl
        from urllib import urlencode
    url_parts = list(urlparse(url))
    query = dict(parse_qsl(url_parts[4]))  # index 4 is the query component
    query.update(params)
    url_parts[4] = urlencode(query)
    return urlunparse(url_parts)
def clean_url(self, url=None):
    """Strip ``utm_*`` tracking parameters from *url* and return it.

    Falls back to ``self.url`` when *url* is empty/None. Also rewrites
    AJAX-crawl fragments (``#!`` -> ``?_escaped_fragment_=``) and logs
    the cleaned URL on ``self.logger``.

    Fixes versus the original: ``cgi.parse_qs`` (deprecated since 2.6)
    is replaced by the documented ``parse_qs`` from the urlparse/
    urllib.parse module, and py2-only ``iteritems()`` becomes
    ``items()`` so the method also runs under Python 3.
    """
    try:  # Python 3
        from urllib.parse import urlparse, urlunparse, parse_qs, urlencode
    except ImportError:  # Python 2
        from urlparse import urlparse, urlunparse, parse_qs
        from urllib import urlencode
    if not url:
        url = self.url
    u = urlparse(url)
    qs = parse_qs(u[4])  # index 4 is the query component
    qs = dict((k, v) for k, v in qs.items() if not k.startswith('utm_'))
    # doseq=True re-expands the value lists produced by parse_qs
    u = u._replace(query=urlencode(qs, True))
    url = urlunparse(u)
    url = url.replace('#!', '?_escaped_fragment_=')
    self.logger.info("cleaned url : %s" % url)
    return url
def download_file(datafile, scraped_title, book, page, maxpage):
    """Parse the saved form data in *datafile*, assemble the download
    URL and fetch the audio file into ``options.dw_dir``.

    Returns True when a download was performed, False when the file
    already exists locally with the expected size (in which case it
    sleeps a minute to avoid server throttling).

    Raises the last ``urllib2.HTTPError`` / ``socket.error`` if the
    HEAD request still fails after 5 retries.
    """
    with open(datafile) as f:
        logging.info("Parsing %s, creating download url" % datafile)
        lines = f.readlines()
    dw_options = parse_qs(lines[0])
    title = dw_options["title"][0]
    if title != scraped_title:
        logging.info("Found real title: %s" % (title,))
    logging.info("Parsed data for book '%s'" % (title,))
    url = dw_options["assemble_url"][0]
    params = {}
    for param in ["user_id", "product_id", "codec", "awtype", "cust_id"]:
        # NOTE(review): the full codec name is truncated — presumably the
        # server expects the short form "LC_64_22050_ster"; confirm.
        if dw_options[param][0] == "LC_64_22050_stereo":
            params[param] = "LC_64_22050_ster"
        else:
            params[param] = dw_options[param][0]
    # Merge the extracted params into the assemble_url query string.
    url_parts = list(urlparse.urlparse(url))
    query = dict(urlparse.parse_qsl(url_parts[4]))
    query.update(params)
    url_parts[4] = urlencode(query)
    url = urlparse.urlunparse(url_parts)
    logging.info("Book URL: %s" % url)
    logging.info("Downloading file data")
    # HEAD request first, to learn the target filename and size.
    request_head = HeadRequest(url)
    request_head.add_header('User-Agent', 'Audible ADM 6.6.0.19;Windows Vista Service Pack 1 Build 7601')
    tries = 0
    head_ok = False
    while not head_ok:
        try:
            head = urllib2.urlopen(request_head)
            head_ok = True
        except urllib2.HTTPError as e_head:
            if tries < 5:
                tries = tries + 1
                time.sleep(60)  # back off before retrying
            else:
                raise e_head
        except socket.error as se:
            if tries < 5:
                tries = tries + 1
                time.sleep(60)  # back off before retrying
            else:
                # BUG FIX: the original re-raised e_head here, which is
                # only bound in the HTTPError handler (NameError / wrong
                # exception). Re-raise the socket error instead.
                raise se
    # Derive local filename from the Content-Disposition header:
    # keep the part before the first "_" plus the original extension.
    val, par = cgi.parse_header(head.info().dict['content-disposition'])
    filename = par['filename'].split("_")[0]
    filename = filename + "." + par['filename'].split(".")[-1]
    size = head.info().dict['content-length']
    logging.info("Filename: %s" % filename)
    logging.info("Size: %s" % size)
    path = "%s%s" % (options.dw_dir, filename)
    logging.info("Book %s of 20 on page %s of %s" % (book, page, maxpage))
    if os.path.isfile(path):
        logging.info("File %s exist, checking size", path)
        if int(size) == os.path.getsize(path):
            logging.info("File %s has correct size, not downloading" % (path,))
            time.sleep(60)  # sleep a minute to not be throttled
            return False
        else:
            logging.warning("File %s had unexpected size, downloading" % (path,))
    else:
        logging.info("File %s does not exist, downloading" % (path,))
    opener = LyingFancyURLopener()
    local_filename, headers = opener.retrieve(url, path, reporthook=print_progress)
    logging.info("Completed download of '%s' to %s" % (title, path))
    return True
def test_getservbyport():
    """Print an example URL for each well-known port, using the service
    name looked up via ``socket.getservbyport`` as the URL scheme
    (e.g. 80 -> "http://example.com/").

    Fixed to run under both Python 2 and 3: the py2-only ``print``
    statement becomes a parenthesized single-argument ``print()`` (valid
    in both), and the urlparse import falls back between the two module
    locations. ``socket`` is imported locally so the function is
    self-contained.
    """
    import socket
    try:  # Python 3
        from urllib.parse import urlunparse
    except ImportError:  # Python 2
        from urlparse import urlunparse
    for port in [80, 443, 21, 70, 25, 143, 993, 110, 995]:
        print(urlunparse((socket.getservbyport(port), 'example.com', '/', '', '', '')))