def update_hlland(docdate, docno, password):
    u"""Update the land-office touch-screen system.

    Downloads the ``376550400A_<docno>.zip`` attachment for the given
    document into the ``tmp`` directory and unpacks it.

    docdate  -- value substituted into the directory part of the URL
    docno    -- document number substituted into the archive file name
    password -- password handed to ``unzip`` together with the archive
    """
    local_zip = os.path.join('tmp', '376550400A_%s.zip' % docno)
    remote_zip = 'http://att.hl.gov.tw/SENDATT_FILE/%s/376550400A_%s.zip' % (
        docdate, docno)
    wget(remote_zip, local_zip)
    unzip(local_zip, password)
def save_to(self, path, mirror_prefix=None):
    """Download this file into ``path`` and verify it against ``self.md5``.

    path          -- directory the file is stored in; the target is
                     ``path + "/" + self.file``
    mirror_prefix -- when given, the download URL is
                     ``mirror_prefix + self.file`` instead of ``self.url``

    Returns True when a file with the expected checksum is present
    (either already on disk or freshly downloaded) and False when the
    download fails or is interrupted with Ctrl-C.

    Raises Exception when the target already exists before downloading
    but its checksum differs from ``self.md5``.

    NOTE: a download that succeeds but yields a wrong checksum is retried
    without an upper bound, exactly as before.
    """
    fname = path + "/" + self.file

    if os.path.exists(fname):
        if md5sum(fname) == self.md5:
            # file already exists and checksum matches
            print(fname + " " + self.md5 + " exists, skipping")
            return True
        raise Exception(
            "{} exists but checksum does not match {}".format(
                fname, self.md5))

    def discard_partial():
        # A failed or interrupted download may not have created the file
        # at all; unlinking blindly would raise OSError and hide the
        # ``False`` result from the caller.
        if os.path.exists(fname):
            os.unlink(fname)

    # Use the mirror if one was specified.
    url = self.url if mirror_prefix is None else mirror_prefix + self.file

    while True:
        try:
            suc = wget(url, path, retry=5, timeout=5)
        except KeyboardInterrupt:
            discard_partial()
            return False

        if not suc:
            discard_partial()
            return False

        if md5sum(fname) == self.md5:
            return True
        print("md5 not match, retrying")
def operation():
    """Run ``wget`` with the enclosing ``args``/``kwds`` under THREAD_LOCK.

    THREAD_LOCK is also handed to ``wget`` through the ``unlock`` keyword
    argument. The lock is released when the call returns or raises.
    """
    kwds['unlock'] = THREAD_LOCK
    with THREAD_LOCK:
        return wget(*args, **kwds)
def __init__(self):
    '''
    Store the pair returned by connect() as ``conn``/``cursor`` and
    create a ``wget`` instance.

    NOTE(review): the previous docstring listed thread_dict['url'] and
    thread_dict['title'] as parameters, but this initializer accepts no
    arguments - confirm where those values are actually supplied.
    '''
    connection, cursor = connect()
    self.conn = connection
    self.cursor = cursor
    self.wget = wget()
def _image_subtype(url):
    """Return the ``image/<subtype>`` suffix to advertise for ``url``."""
    # Drop fragment and query string so "logo.png?v=2" still reads as png.
    path = url.split('#', 1)[0].split('?', 1)[0]
    basename = path.rsplit('/', 1)[-1]
    if '.' in basename:
        ext = basename.rsplit('.', 1)[1].lower()
        if ext:
            # Common extensions whose registered subtype differs from them.
            aliases = {'jpg': 'jpeg', 'jpe': 'jpeg', 'svg': 'svg+xml',
                       'tif': 'tiff', 'ico': 'x-icon'}
            return aliases.get(ext, ext)
    # No usable extension: keep the historical last-three-characters guess.
    return url[-3:]


def inline_images(options, html):
    """
    Replace CSS ``url(...)`` references and ``<img src=...>`` attributes
    with base64 ``data:`` URIs of the downloaded resources.

    @param options optparse Options object.
    @param html string. HTML document string.
    @return string. The document with each matched reference inlined.
    """
    passes = (
        (css_url_re, 'url(data:image/%s;base64,%s)'),
        (img_src_re, '<img src="data:image/%s;base64,%s"'),
    )
    for pattern, template in passes:
        # finditer() scans the document as it stood when this pass began;
        # rebinding ``html`` below does not disturb the running iteration.
        for m in pattern.finditer(html):
            url = gen_rel_url(options, m.group('url'))
            b64_data = base64.b64encode(wget.wget(url))
            html = html.replace(
                m.group('wholething'),
                template % (_image_subtype(url), b64_data),
                1)
    return html
def include_bare_minimum_css(options, html, omit_bad_css=True):
    """
    Gather the CSS rules from the document's linked and embedded
    stylesheets whose selectors match at least one element of the document.

    @param options optparse Options object.
    @param html string. HTML document string.
    @param omit_bad_css boolean. Defaults to True. When True, erroneous CSS
        will simply be omitted. When False, any questionable CSS will be
        included.
    """
    d = pyquery.PyQuery(html)
    favicon = None
    stylesheets = []
    for link_or_style_ele in d('link,style'):
        if link_or_style_ele.tag == 'link':
            if link_or_style_ele.attrib.get('rel') == 'shortcut icon':
                # Remember the favicon instead of treating it as CSS.
                favicon = link_or_style_ele.attrib['href']
                continue
            stylesheets.append(
                wget.wget(
                    gen_rel_url(options, link_or_style_ele.attrib['href'])))
        elif link_or_style_ele.tag == 'style':
            # NOTE(review): str() of the element is used as the stylesheet
            # text - confirm it yields the CSS source, not an element repr.
            stylesheets.append(str(link_or_style_ele))

    out = ''
    for stylesheet in stylesheets:
        # Splitting on '}' yields one "selectors { declarations" chunk per
        # rule; the brace is re-appended so css_rule_re sees a whole rule.
        for stmt in stylesheet.split('}'):
            stmt = stmt.replace('\n', ' ').strip(' ') + '}'
            match = css_rule_re.match(stmt)
            if not match:
                continue
            rule = match.group('rule')
            include_specifiers = ''
            for specifier in match.group('specifiers').split(','):
                clean_specifier = re.sub(css_spec_cleaning_re, '', specifier)
                try:
                    # Keep the selector only if it matches something.
                    include_current = len(d(clean_specifier)) > 0
                except Exception:
                    # PQ can't handle it; it is likely bad CSS, so omit it
                    # unless the caller asked for questionable CSS to be
                    # kept (omit_bad_css=False). The previous code had
                    # this test inverted.
                    include_current = not omit_bad_css
                if include_current:
                    if include_specifiers:
                        include_specifiers += ',' + specifier
                    else:
                        include_specifiers = specifier
            if include_specifiers:
                # Then this rule is used, so include it.
                out += '%s %s\n' % (include_specifiers, rule)
    # NOTE(review): this excerpt ends here; the code that consumes ``out``
    # and ``favicon`` is not part of the visible source.
def _image_subtype(url):
    """Return the ``image/<subtype>`` suffix to advertise for ``url``."""
    # Drop fragment and query string so "logo.png?v=2" still reads as png.
    path = url.split('#', 1)[0].split('?', 1)[0]
    basename = path.rsplit('/', 1)[-1]
    if '.' in basename:
        ext = basename.rsplit('.', 1)[1].lower()
        if ext:
            # Common extensions whose registered subtype differs from them.
            aliases = {'jpg': 'jpeg', 'jpe': 'jpeg', 'svg': 'svg+xml',
                       'tif': 'tiff', 'ico': 'x-icon'}
            return aliases.get(ext, ext)
    # No usable extension: keep the historical last-three-characters guess.
    return url[-3:]


def inline_images(options, html):
    """
    Replace CSS ``url(...)`` references and ``<img src=...>`` attributes
    with base64 ``data:`` URIs of the downloaded resources.

    @param options optparse Options object.
    @param html string. HTML document string.
    @return string. The document with each matched reference inlined.
    """
    passes = (
        (css_url_re, 'url(data:image/%s;base64,%s)'),
        (img_src_re, '<img src="data:image/%s;base64,%s"'),
    )
    for pattern, template in passes:
        # finditer() scans the document as it stood when this pass began;
        # rebinding ``html`` below does not disturb the running iteration.
        for m in pattern.finditer(html):
            url = gen_rel_url(options, m.group('url'))
            b64_data = base64.b64encode(wget.wget(url))
            html = html.replace(
                m.group('wholething'),
                template % (_image_subtype(url), b64_data),
                1)
    return html
#!/usr/bin/python2
# Fetch a sample archive with wget.wget() and print a marker when the
# call reports 0.
import wget

SAMPLE_URL = "http://ipv4.download.thinkbroadband.com/5MB.zip"

status = wget.wget(SAMPLE_URL)
if status == 0:
    print("yo")
def download(collection, basename):
    """Fetch ``<basename>.mat`` unless a file of that name already exists.

    collection -- sub-directory of the remote matrix site holding the file;
                  must not be None when a download is actually needed
    basename   -- matrix name without the ``.mat`` extension

    Returns None. Raises AssertionError when the file is missing locally
    and ``collection`` is None.
    """
    # No trailing slash here: the format string below supplies every
    # separator, so the URL no longer contains "mat//<collection>".
    baseURL = 'http://www.cise.ufl.edu/research/sparse/mat'
    matname = '{0}.mat'.format(basename)
    if os.path.exists(matname):
        return
    assert collection is not None, "Invalid collection for download"
    wget('{0}/{1}/{2}'.format(baseURL, collection, matname))
def download(docno):
    """Download the ``-01.tif`` image of a document into ``tmp``.

    docno -- document number; its last character is dropped before it is
             used in both the remote URL and the local file name

    The file is saved as ``tmp/<docno without last char>.tif``.
    """
    # The unused ``fs`` list of Ruby-style "#{docno}" names was dead code
    # (Python never interpolates them) and has been removed.
    stem = docno[0:-1]
    url = 'http://www.fdc.gov.tw/public/doc/%s-01.tif' % stem
    local = os.path.join('tmp', '%s.tif' % stem)
    wget(url, local)
# Read the Tabla1.xlsx file, which contains some soccer championship scores.
# The columns used below are "Equipo" (team), "Goles a favor" (goals for)
# and "Goles en contra" (goals against).
# For each team, calculate the goal difference and print it.
import pandas as pd
import wget as w

url = "https://raw.githubusercontent.com/IEEESBITBA/Curso-Python/master/Clase_3_datos/Tabla1.xlsx"
w.wget(url)

table = pd.read_excel("Tabla1.xlsx")
# The original referenced an undefined name (``archivo``) here, which
# raised NameError before anything was printed.
data = table.to_dict("records")

for row in data:
    difference = row["Goles a favor"] - row["Goles en contra"]
    # Report the magnitude and say in words which way it points.
    sign = "against" if difference < 0 else "in favor"
    print(row["Equipo"], "has a difference of", abs(difference), "goals", sign)
os.mkdir("images") except: pass # Download all images counter = 1 total = frog.image_count download_errors = [] for img in frog.images["images"]: # save image in a subfolder saveto = os.path.join("images", img["original_filename"]) print str(counter)+" of "+str(total)+" ("+"{0:.2f}".format(float(counter)/total)+"%): "+img["original_filename"] counter += 1 # invoke wget for downloading process = wget(img["direct_link"], saveto, frog.client.Cookies) process.wait() # remember all failed downloads if process.returncode != 0: download_errors.append(img["direct_link"]) print "Download completed." # Print a list of the failed downloads if len(download_errors) > 0: print "\nThere were errors attempting to download the following files:" for url in download_errors: print url print ""
""" Copyright (c) 2011-2013 F-Secure See LICENSE for details """ import os, sys, traceback import wget if __name__ == '__main__': if len(sys.argv) < 2: wget.report_error("Use runurl.py url") url = sys.argv[1] wget.wget(url) try: filename = url.split('/')[-1].split('#')[0].split('?')[0] os.system("start " + filename) except Exception, ex: msg = "Error while running url %s\n\n%s\n\n%s" % (str(url), str(ex), traceback.format_exc()) wget.report_error(msg)
include_current = True if include_current: if len(include_specifiers): include_specifiers += ',' + specifier else: include_specifiers = specifier if len(include_specifiers): # Then this rule is used, so include it. out += '%s %s\n' % (include_specifiers, rule) d('link,style').replaceWith('') d('head').append('<style type="text/css">%s</style>' % out) if favicon is not None: try: favicon_bin = wget.wget(favicon) favicon_b64 = base64.b64encode(favicon_bin) d('head').append( '<link id="favicon" rel="shortcut icon" type="image/png" href="data:image/png;base64,%s">' % favicon_b64) except Exception, e: #print 'error: favicon integration failed' pass return str(d) css_url_re = re.compile( r'''(?P<wholething>url\s*\(\s*['"]?(?P<url>[^\)'"]*)['"]?\s*\))''', re.I | re.M) img_src_re = re.compile( r'''(?P<wholething><\s*img [^>]*src=['"](?P<url>[^'"]*)['"])''',
include_current = True if include_current: if len(include_specifiers): include_specifiers += ',' + specifier else: include_specifiers = specifier if len(include_specifiers): # Then this rule is used, so include it. out += '%s %s\n' % (include_specifiers, rule) d('link,style').replaceWith('') d('head').append('<style type="text/css">%s</style>' % out) if favicon is not None: try: favicon_bin = wget.wget(favicon) favicon_b64 = base64.b64encode(favicon_bin) d('head').append('<link id="favicon" rel="shortcut icon" type="image/png" href="data:image/png;base64,%s">' % favicon_b64) except Exception, e: #print 'error: favicon integration failed' pass return str(d) css_url_re = re.compile(r'''(?P<wholething>url\s*\(\s*['"]?(?P<url>[^\)'"]*)['"]?\s*\))''', re.I | re.M) img_src_re = re.compile(r'''(?P<wholething><\s*img [^>]*src=['"](?P<url>[^'"]*)['"])''', re.I | re.M) def inline_images(options, html): """ @param options optparse Options object. @param html string. HTML document string.
def include_bare_minimum_css(options, html, omit_bad_css=True):
    """
    Gather the CSS rules from the document's linked and embedded
    stylesheets whose selectors match at least one element of the document.

    @param options optparse Options object.
    @param html string. HTML document string.
    @param omit_bad_css boolean. Defaults to True. When True, erroneous CSS
        will simply be omitted. When False, any questionable CSS will be
        included.
    """
    d = pyquery.PyQuery(html)
    favicon = None
    stylesheets = []
    for link_or_style_ele in d('link,style'):
        if link_or_style_ele.tag == 'link':
            if link_or_style_ele.attrib.get('rel') == 'shortcut icon':
                # Remember the favicon instead of treating it as CSS.
                favicon = link_or_style_ele.attrib['href']
                continue
            stylesheets.append(
                wget.wget(
                    gen_rel_url(options, link_or_style_ele.attrib['href'])))
        elif link_or_style_ele.tag == 'style':
            # NOTE(review): str() of the element is used as the stylesheet
            # text - confirm it yields the CSS source, not an element repr.
            stylesheets.append(str(link_or_style_ele))

    out = ''
    for stylesheet in stylesheets:
        # Splitting on '}' yields one "selectors { declarations" chunk per
        # rule; the brace is re-appended so css_rule_re sees a whole rule.
        for stmt in stylesheet.split('}'):
            stmt = stmt.replace('\n', ' ').strip(' ') + '}'
            match = css_rule_re.match(stmt)
            if not match:
                continue
            rule = match.group('rule')
            include_specifiers = ''
            for specifier in match.group('specifiers').split(','):
                clean_specifier = re.sub(css_spec_cleaning_re, '', specifier)
                try:
                    # Keep the selector only if it matches something.
                    include_current = len(d(clean_specifier)) > 0
                except Exception:
                    # PQ can't handle it; it is likely bad CSS, so omit it
                    # unless the caller asked for questionable CSS to be
                    # kept (omit_bad_css=False). The previous code had
                    # this test inverted.
                    include_current = not omit_bad_css
                if include_current:
                    if include_specifiers:
                        include_specifiers += ',' + specifier
                    else:
                        include_specifiers = specifier
            if include_specifiers:
                # Then this rule is used, so include it.
                out += '%s %s\n' % (include_specifiers, rule)
    # NOTE(review): this excerpt ends here; the code that consumes ``out``
    # and ``favicon`` is not part of the visible source.
""" Copyright (c) 2011-2013 F-Secure See LICENSE for details """ import os, sys, traceback import wget if __name__ == '__main__': if len(sys.argv) < 2: wget.report_error("Use runurl.py url") url = sys.argv[1] wget.wget(url) try: filename = url.split('/')[-1].split('#')[0].split('?')[0] os.system("start " + filename) except Exception, ex: msg = "Error while running url %s\n\n%s\n\n%s" % ( str(url), str(ex), traceback.format_exc()) wget.report_error(msg)