Example #1
0
def update_hlland(docdate, docno, password):
    """Update the land-administration touch-screen system.

    Downloads the ``376550400A_<docno>.zip`` archive found under the
    ``<docdate>`` directory of the remote server into the local ``tmp``
    directory, then passes the downloaded archive and ``password`` to
    ``unzip``.
    """
    archive_url = 'http://att.hl.gov.tw/SENDATT_FILE/%s/376550400A_%s.zip' % (
        docdate, docno)
    archive_path = os.path.join('tmp', '376550400A_%s.zip' % docno)
    wget(archive_url, archive_path)
    unzip(archive_path, password)
Example #2
0
    def save_to(self, path, mirror_prefix=None):
        """Download this file into ``path`` and verify its MD5 checksum.

        If the target file already exists, its checksum is compared first: a
        match skips the download, a mismatch raises.  Otherwise the file is
        fetched with ``wget`` (from ``mirror_prefix + self.file`` when a
        mirror prefix is given, else from ``self.url``) and fetched again for
        as long as the download succeeds but the checksum does not match.

        :param path: directory the file is saved into.
        :param mirror_prefix: optional URL prefix used instead of ``self.url``.
        :returns: ``True`` when a file with the expected checksum is in
            place, ``False`` when the download failed or was interrupted.
        :raises Exception: if a pre-existing file has a different checksum.
        """
        fname = path + "/" + self.file

        def discard_partial():
            # A failed or interrupted download may not have created the file
            # at all; unlinking unconditionally would raise OSError and hide
            # the ``False`` result from the caller.
            if os.path.exists(fname):
                os.unlink(fname)

        if os.path.exists(fname):
            if md5sum(fname) == self.md5:
                # file already exists and checksum matches
                print(fname + " " + self.md5 + " exists, skipping")
                return True
            raise Exception(
                "{} exists but checksum does not match {}".format(
                    fname, self.md5))

        # use the mirror if specified; the URL is the same for every attempt
        url = self.url if mirror_prefix is None else mirror_prefix + self.file

        # NOTE(review): a persistent checksum mismatch retries forever --
        # confirm whether an upper bound on attempts is wanted.
        while True:
            try:
                suc = wget(url, path, retry=5, timeout=5)
            except KeyboardInterrupt:
                discard_partial()
                return False

            if not suc:
                discard_partial()
                return False

            if md5sum(fname) == self.md5:
                return True
            print("md5 not match, retrying")
Example #3
0
 def operation():
     """Call ``wget(*args, **kwds)`` while holding ``THREAD_LOCK``.

     The lock is also handed to ``wget`` through the ``unlock`` keyword.
     It is released after the call has returned or raised, and the value
     produced by ``wget`` is returned to the caller.
     """
     kwds['unlock'] = THREAD_LOCK
     THREAD_LOCK.acquire()
     try:
         outcome = wget(*args, **kwds)
     finally:
         THREAD_LOCK.release()
     return outcome
Example #4
0
    def __init__(self):
        """Set up the connection, cursor and wget helper for this instance.

        ``connect()`` supplies the pair stored on ``self.conn`` and
        ``self.cursor``; ``wget()`` supplies the object stored on
        ``self.wget``.

        NOTE(review): the previous docstring listed ``thread_dict['url']``
        and ``thread_dict['title']`` as parameters, but this constructor
        takes no arguments -- presumably they are consumed elsewhere; confirm.
        """
        connection, cursor = connect()
        self.conn = connection
        self.cursor = cursor
        self.wget = wget()
Example #5
0
 def operation():
     """Call ``wget(*args, **kwds)`` while holding ``THREAD_LOCK``.

     The lock is also handed to ``wget`` through the ``unlock`` keyword.
     It is released after the call has returned or raised, and the value
     produced by ``wget`` is returned to the caller.
     """
     kwds['unlock'] = THREAD_LOCK
     THREAD_LOCK.acquire()
     try:
         outcome = wget(*args, **kwds)
     finally:
         THREAD_LOCK.release()
     return outcome
def inline_images(options, html):
    """
    Replace image references in an HTML document with inline base64 data URIs.

    Every CSS ``url(...)`` expression matched by ``css_url_re`` and every
    ``<img ... src="...">`` matched by ``img_src_re`` has its URL resolved
    through ``gen_rel_url``, fetched with ``wget.wget`` and substituted in
    place (first remaining occurrence of the matched text).

    @param options optparse Options object.

    @param html string.  HTML document string.

    @return string.  The document with the matched references inlined.
    """
    import mimetypes

    def data_uri(url):
        # Label the payload with a real MIME type (``.jpg`` -> image/jpeg,
        # ``.svg`` -> image/svg+xml).  The last three characters of the URL,
        # which used to be the only source, are wrong for e.g. ``.jpeg`` and
        # are now just the fallback when no image type can be guessed.
        mime_type = mimetypes.guess_type(url)[0]
        if not mime_type or not mime_type.startswith('image/'):
            mime_type = 'image/%s' % url[-3:]
        payload = base64.b64encode(wget.wget(url))
        return 'data:%s;base64,%s' % (mime_type, payload)

    # Each finditer() scans the string ``html`` referred to when its loop
    # started; inside the loop ``html`` is rebound to the rewritten copy.
    for m in css_url_re.finditer(html):
        uri = data_uri(gen_rel_url(options, m.group('url')))
        html = html.replace(m.group('wholething'), 'url(%s)' % uri, 1)
    for m in img_src_re.finditer(html):
        uri = data_uri(gen_rel_url(options, m.group('url')))
        html = html.replace(m.group('wholething'), '<img src="%s"' % uri, 1)
    return html
def include_bare_minimum_css(options, html, omit_bad_css=True):
    """
    Gather the CSS used by an HTML document and keep only the rules whose
    selectors match at least one element of that document.

    Stylesheets come from every ``<link>`` (fetched with ``wget.wget`` after
    resolving its ``href`` through ``gen_rel_url``) and every ``<style>``
    element.  A ``<link rel="shortcut icon">`` is remembered as the favicon
    instead of being fetched as a stylesheet.

    @param options optparse Options object.

    @param html string.  HTML document string.

    @param omit_bad_css boolean.  Defaults to True.  When True, erroneous CSS
    will simply be omitted.  When False, any questionable CSS will be included.
    """
    d = pyquery.PyQuery(html)
    favicon = None
    stylesheets = []
    for link_or_style_ele in d('link,style'):
        if link_or_style_ele.tag == 'link':
            if link_or_style_ele.attrib.get('rel') == 'shortcut icon':
                favicon = link_or_style_ele.attrib['href']
                continue
            stylesheets.append(wget.wget(gen_rel_url(options, link_or_style_ele.attrib['href'])))
        elif link_or_style_ele.tag == 'style':
            # NOTE(review): str() of an element may yield its repr rather
            # than the CSS text -- confirm whether ``.text`` was intended.
            stylesheets.append(str(link_or_style_ele))

    out = ''

    for stylesheet in stylesheets:
        for stmt in stylesheet.split('}'):
            # split() removed the closing brace; restore it before matching.
            stmt = stmt.replace('\n', ' ').strip(' ') + '}'
            match = css_rule_re.match(stmt)
            if not match:
                continue
            specifiers = match.group('specifiers')
            rule = match.group('rule')
            include_specifiers = ''
            for specifier in specifiers.split(','):
                clean_specifier = re.sub(css_spec_cleaning_re, '', specifier)
                include_current = False
                try:
                    # A selector that matches something in the document is
                    # in use, so it is kept.
                    if len(d(clean_specifier)):
                        include_current = True
                except Exception:  # e.g. lxml.cssselect.ExpressionError
                    # PQ can't handle it; it is likely bad CSS.  Keep it only
                    # when the caller asked NOT to omit bad CSS.  (The test
                    # used to be inverted, so the default kept bad CSS.)
                    include_current = not omit_bad_css
                if include_current:
                    if len(include_specifiers):
                        include_specifiers += ',' + specifier
                    else:
                        include_specifiers = specifier
            if len(include_specifiers):
                # Then this rule is used, so include it.
                out += '%s %s\n' % (include_specifiers, rule)
    # NOTE(review): the visible block stops here -- ``out`` and ``favicon``
    # are computed but not used or returned within it; presumably the
    # remainder of the function consumes them.
Example #8
0
def inline_images(options, html):
    """
    Replace image references in an HTML document with inline base64 data URIs.

    Every CSS ``url(...)`` expression matched by ``css_url_re`` and every
    ``<img ... src="...">`` matched by ``img_src_re`` has its URL resolved
    through ``gen_rel_url``, fetched with ``wget.wget`` and substituted in
    place (first remaining occurrence of the matched text).

    @param options optparse Options object.

    @param html string.  HTML document string.

    @return string.  The document with the matched references inlined.
    """
    import mimetypes

    def data_uri(url):
        # Label the payload with a real MIME type (``.jpg`` -> image/jpeg,
        # ``.svg`` -> image/svg+xml).  The last three characters of the URL,
        # which used to be the only source, are wrong for e.g. ``.jpeg`` and
        # are now just the fallback when no image type can be guessed.
        mime_type = mimetypes.guess_type(url)[0]
        if not mime_type or not mime_type.startswith('image/'):
            mime_type = 'image/%s' % url[-3:]
        payload = base64.b64encode(wget.wget(url))
        return 'data:%s;base64,%s' % (mime_type, payload)

    # Each finditer() scans the string ``html`` referred to when its loop
    # started; inside the loop ``html`` is rebound to the rewritten copy.
    for m in css_url_re.finditer(html):
        uri = data_uri(gen_rel_url(options, m.group('url')))
        html = html.replace(
            m.group('wholething'), 'url(%s)' % uri, 1)
    for m in img_src_re.finditer(html):
        uri = data_uri(gen_rel_url(options, m.group('url')))
        html = html.replace(
            m.group('wholething'), '<img src="%s"' % uri, 1)
    return html
#!/usr/bin/python2

import wget

# Fetch 5MB.zip and print a marker when wget.wget() reports 0.
download_status = wget.wget("http://ipv4.download.thinkbroadband.com/5MB.zip")
if download_status == 0:
    print("yo")



def download(collection, basename):
    """Fetch ``<basename>.mat`` from www.cise.ufl.edu unless it already exists.

    Nothing happens when ``<basename>.mat`` is already present in the
    current working directory.  Otherwise the file is requested from
    ``<base>/<collection>/<basename>.mat`` through ``wget``.

    :param collection: collection directory on the server; must not be
        ``None`` when a download is needed.
    :param basename: matrix name without the ``.mat`` extension.
    :raises AssertionError: if a download is needed and ``collection`` is
        ``None``.
    """
    # No trailing slash here: the separator comes from the template below.
    # With both, the request went to ".../mat//<collection>/<name>.mat".
    base_url = 'http://www.cise.ufl.edu/research/sparse/mat'
    matname = '{0}.mat'.format(basename)
    if os.path.exists(matname):
        return
    # Kept as an assert so callers still see AssertionError.
    assert collection is not None, "Invalid collection for download"
    wget('{0}/{1}/{2}'.format(base_url, collection, matname))
Example #11
0
def download(docno):
    """Download ``<docno minus its last character>-01.tif`` into ``tmp``.

    The last character of ``docno`` is dropped before it is used in both the
    remote URL and the local file name.

    NOTE(review): an unused list of Ruby-style ``#{docno}`` templates was
    removed; it also named an attachment ``.tif`` that is never fetched --
    confirm whether that second download is wanted.
    """
    doc_id = docno[0:-1]
    url = 'http://www.fdc.gov.tw/public/doc/%s-01.tif' % doc_id
    local = os.path.join('tmp', '%s.tif' % doc_id)
    wget(url, local)
Example #12
0
#  Read the Tabla1.xlsx file, which contains some soccer championship scores.
#  The columns used are "Equipo" (team), "Goles a favor" (goals for) and
#  "Goles en contra" (goals against). For each team, compute the goal
#  difference and print it.

import wget as w
import pandas as pd

url = "https://raw.githubusercontent.com/IEEESBITBA/Curso-Python/master/Clase_3_datos/Tabla1.xlsx"
w.wget(url)

# The spreadsheet is read from the local copy named Tabla1.xlsx.
table = pd.read_excel("Tabla1.xlsx")
# One dict per spreadsheet row, keyed by column name.
data = table.to_dict("records")

for row in data:
    difference = row["Goles a favor"] - row["Goles en contra"]
    sign = "in favor"
    if difference < 0:
        # Report the magnitude and describe the direction in words.
        difference = -difference
        sign = "against"
    print(row["Equipo"], "has a difference of", difference, "goals", sign)


    os.mkdir("images")
except:
    pass

# Download all images
counter         = 1
total           = frog.image_count
download_errors = []
for img in frog.images["images"]:
    # save image in a subfolder
    saveto = os.path.join("images", img["original_filename"])
    # Progress line: the ratio is scaled by 100 so that the number printed
    # in front of "%" really is a percentage.
    percent = "{0:.2f}".format(100.0 * counter / total)
    print(str(counter)+" of "+str(total)+" ("+percent+"%): "+img["original_filename"])
    counter += 1

    # invoke wget for downloading
    process = wget(img["direct_link"], saveto, frog.client.Cookies)
    process.wait()

    # remember all failed downloads
    if process.returncode != 0:
        download_errors.append(img["direct_link"])

print("Download completed.")

# Print a list of the failed downloads
if download_errors:
    print("\nThere were errors attempting to download the following files:")
    for url in download_errors:
        print(url)
    print("")
Example #14
0
"""
Copyright (c) 2011-2013 F-Secure
See LICENSE for details
"""

import os, sys, traceback
import wget

if __name__ == '__main__':
    # Usage: runurl.py <url> -- download the URL, then open the local file
    # named after the last path segment of the URL.
    if len(sys.argv) < 2:
        wget.report_error("Use runurl.py url")
        # NOTE(review): assuming report_error() can return -- without a URL
        # there is nothing left to do, so stop instead of failing on
        # sys.argv[1] below.
        sys.exit(1)

    url = sys.argv[1]
    wget.wget(url)
    try:
        # last path segment, without fragment or query string
        filename = url.split('/')[-1].split('#')[0].split('?')[0]
        # os.startfile opens the file with its associated application, like
        # the shell's "start", but the URL-derived name never goes through a
        # command interpreter (a name such as "a&calc.exe" would otherwise
        # run a second command).  Failures raise and are reported below.
        os.startfile(filename)
    except Exception as ex:
        msg = "Error while running url %s\n\n%s\n\n%s" % (str(url), str(ex), traceback.format_exc())
        wget.report_error(msg)
    
Example #15
0
                            include_current = True
                    if include_current:
                        if len(include_specifiers):
                            include_specifiers += ',' + specifier
                        else:
                            include_specifiers = specifier
                if len(include_specifiers):
                    # Then this rule is used, so include it.
                    out += '%s %s\n' % (include_specifiers, rule)

    d('link,style').replaceWith('')
    d('head').append('<style type="text/css">%s</style>' % out)

    if favicon is not None:
        try:
            favicon_bin = wget.wget(favicon)
            favicon_b64 = base64.b64encode(favicon_bin)
            d('head').append(
                '<link id="favicon" rel="shortcut icon" type="image/png" href="data:image/png;base64,%s">'
                % favicon_b64)
        except Exception, e:
            #print 'error: favicon integration failed'
            pass
    return str(d)


# Matches a CSS ``url(...)`` expression whose argument may be quoted with
# single or double quotes.  Group ``wholething`` is the entire expression,
# group ``url`` the address inside it.  Case-insensitive.
css_url_re = re.compile(
    r'''(?P<wholething>url\s*\(\s*['"]?(?P<url>[^\)'"]*)['"]?\s*\))''',
    re.I | re.M)
img_src_re = re.compile(
    r'''(?P<wholething><\s*img [^>]*src=['"](?P<url>[^'"]*)['"])''',
Example #16
0
def download(docno):
    """Download ``<docno minus its last character>-01.tif`` into ``tmp``.

    The last character of ``docno`` is dropped before it is used in both the
    remote URL and the local file name.

    NOTE(review): an unused list of Ruby-style ``#{docno}`` templates was
    removed; it also named an attachment ``.tif`` that is never fetched --
    confirm whether that second download is wanted.
    """
    doc_id = docno[0:-1]
    url = 'http://www.fdc.gov.tw/public/doc/%s-01.tif' % doc_id
    local = os.path.join('tmp', '%s.tif' % doc_id)
    wget(url, local)
                            include_current = True
                    if include_current:
                        if len(include_specifiers):
                            include_specifiers += ',' + specifier
                        else:
                            include_specifiers = specifier
                if len(include_specifiers):
                    # Then this rule is used, so include it.
                    out += '%s %s\n' % (include_specifiers, rule)

    d('link,style').replaceWith('')
    d('head').append('<style type="text/css">%s</style>' % out)

    if favicon is not None:
        try:
            favicon_bin = wget.wget(favicon)
            favicon_b64 = base64.b64encode(favicon_bin)
            d('head').append('<link id="favicon" rel="shortcut icon" type="image/png" href="data:image/png;base64,%s">' % favicon_b64)
        except Exception, e:
            #print 'error: favicon integration failed'
            pass
    return str(d)

# Matches a CSS ``url(...)`` expression whose argument may be quoted with
# single or double quotes.  Group ``wholething`` is the entire expression,
# group ``url`` the address inside it.  Case-insensitive.
css_url_re = re.compile(r'''(?P<wholething>url\s*\(\s*['"]?(?P<url>[^\)'"]*)['"]?\s*\))''', re.I | re.M)
# Matches the start of an ``<img`` tag up to and including its quoted ``src``
# attribute.  Group ``wholething`` is that whole span, group ``url`` the
# attribute value.  Case-insensitive.
img_src_re = re.compile(r'''(?P<wholething><\s*img [^>]*src=['"](?P<url>[^'"]*)['"])''', re.I | re.M)

def inline_images(options, html):
    """
    @param options optparse Options object.

    @param html string.  HTML document string.
Example #18
0
def include_bare_minimum_css(options, html, omit_bad_css=True):
    """
    Gather the CSS used by an HTML document and keep only the rules whose
    selectors match at least one element of that document.

    Stylesheets come from every ``<link>`` (fetched with ``wget.wget`` after
    resolving its ``href`` through ``gen_rel_url``) and every ``<style>``
    element.  A ``<link rel="shortcut icon">`` is remembered as the favicon
    instead of being fetched as a stylesheet.

    @param options optparse Options object.

    @param html string.  HTML document string.

    @param omit_bad_css boolean.  Defaults to True.  When True, erroneous CSS
    will simply be omitted.  When False, any questionable CSS will be included.
    """
    d = pyquery.PyQuery(html)
    favicon = None
    stylesheets = []
    for link_or_style_ele in d('link,style'):
        if link_or_style_ele.tag == 'link':
            if link_or_style_ele.attrib.get('rel') == 'shortcut icon':
                favicon = link_or_style_ele.attrib['href']
                continue
            stylesheets.append(
                wget.wget(
                    gen_rel_url(options, link_or_style_ele.attrib['href'])))
        elif link_or_style_ele.tag == 'style':
            # NOTE(review): str() of an element may yield its repr rather
            # than the CSS text -- confirm whether ``.text`` was intended.
            stylesheets.append(str(link_or_style_ele))

    out = ''

    for stylesheet in stylesheets:
        for stmt in stylesheet.split('}'):
            # split() removed the closing brace; restore it before matching.
            stmt = stmt.replace('\n', ' ').strip(' ') + '}'
            match = css_rule_re.match(stmt)
            if not match:
                continue
            specifiers = match.group('specifiers')
            rule = match.group('rule')
            include_specifiers = ''
            for specifier in specifiers.split(','):
                clean_specifier = re.sub(css_spec_cleaning_re, '',
                                         specifier)
                include_current = False
                try:
                    # A selector that matches something in the document is
                    # in use, so it is kept.
                    if len(d(clean_specifier)):
                        include_current = True
                except Exception:  # e.g. lxml.cssselect.ExpressionError
                    # PQ can't handle it; it is likely bad CSS.  Keep it only
                    # when the caller asked NOT to omit bad CSS.  (The test
                    # used to be inverted, so the default kept bad CSS.)
                    include_current = not omit_bad_css
                if include_current:
                    if len(include_specifiers):
                        include_specifiers += ',' + specifier
                    else:
                        include_specifiers = specifier
            if len(include_specifiers):
                # Then this rule is used, so include it.
                out += '%s %s\n' % (include_specifiers, rule)
    # NOTE(review): the visible block stops here -- ``out`` and ``favicon``
    # are computed but not used or returned within it; presumably the
    # remainder of the function consumes them.
def download(collection, basename):
    """Fetch ``<basename>.mat`` from www.cise.ufl.edu unless it already exists.

    Nothing happens when ``<basename>.mat`` is already present in the
    current working directory.  Otherwise the file is requested from
    ``<base>/<collection>/<basename>.mat`` through ``wget``.

    :param collection: collection directory on the server; must not be
        ``None`` when a download is needed.
    :param basename: matrix name without the ``.mat`` extension.
    :raises AssertionError: if a download is needed and ``collection`` is
        ``None``.
    """
    # No trailing slash here: the separator comes from the template below.
    # With both, the request went to ".../mat//<collection>/<name>.mat".
    base_url = 'http://www.cise.ufl.edu/research/sparse/mat'
    matname = '{0}.mat'.format(basename)
    if os.path.exists(matname):
        return
    # Kept as an assert so callers still see AssertionError.
    assert collection is not None, "Invalid collection for download"
    wget('{0}/{1}/{2}'.format(base_url, collection, matname))
Example #20
0
"""
Copyright (c) 2011-2013 F-Secure
See LICENSE for details
"""

import os, sys, traceback
import wget

if __name__ == '__main__':
    # Usage: runurl.py <url> -- download the URL, then open the local file
    # named after the last path segment of the URL.
    if len(sys.argv) < 2:
        wget.report_error("Use runurl.py url")
        # NOTE(review): assuming report_error() can return -- without a URL
        # there is nothing left to do, so stop instead of failing on
        # sys.argv[1] below.
        sys.exit(1)

    url = sys.argv[1]
    wget.wget(url)
    try:
        # last path segment, without fragment or query string
        filename = url.split('/')[-1].split('#')[0].split('?')[0]
        # os.startfile opens the file with its associated application, like
        # the shell's "start", but the URL-derived name never goes through a
        # command interpreter (a name such as "a&calc.exe" would otherwise
        # run a second command).  Failures raise and are reported below.
        os.startfile(filename)
    except Exception as ex:
        msg = "Error while running url %s\n\n%s\n\n%s" % (
            str(url), str(ex), traceback.format_exc())
        wget.report_error(msg)