Example #1
0
def http_download(download_url, outfile, proxy_url=None, proxy_port=None):
    """Download ``download_url`` and store the response body in ``outfile``.

    Args:
        download_url: URL to fetch.
        outfile: Path of the file to write; it is opened in binary write mode.
        proxy_url: Optional proxy host. When truthy, both HTTP and HTTPS
            traffic is routed through it.
        proxy_port: Optional proxy port, appended to ``proxy_url`` as
            ``host:port`` when given.
    """
    if proxy_url:
        # Formatting a missing port would yield the bogus address "host:None".
        if proxy_port is None:
            proxy = "{}".format(proxy_url)
        else:
            proxy = "{}:{}".format(proxy_url, proxy_port)
        mainlog.info("Using a proxy : {}".format(proxy))

        urlopener = build_opener(ProxyHandler({
            'https': proxy,
            'http': proxy
        }), HTTPRedirectHandler())
    else:
        mainlog.info("Not using a proxy")
        urlopener = build_opener(HTTPHandler(), HTTPSHandler(),
                                 HTTPRedirectHandler())

    # Present a browser-like User-agent on every request made by this opener.
    urlopener.addheaders = [(
        'User-agent',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:32.0) Gecko/20100101 Firefox/32.0'
    )]

    datasource = urlopener.open(download_url)
    try:
        # The with-block closes the output file even if a read or write fails.
        with open(outfile, 'wb') as out:
            while True:
                chunk = datasource.read(8192)
                if not chunk:
                    break
                out.write(chunk)
    finally:
        # Always release the connection, including on errors.
        datasource.close()
Example #2
0
 def open(request):
     """Perform the HTTP request described by ``request`` and return its result.

     ``request`` is first converted with ``request_vim_to_python``; the keys
     read afterwards are ``url``, ``data``, ``headers``, ``method``,
     ``username``, ``password``, ``max_redirect``, ``retry``, ``timeout`` and
     ``gzip_decompress``.

     Returns:
         A tuple ``(url, status_line_and_headers, decoded_body)``.
     """
     request = request_vim_to_python(request)
     rhandler = HTTPRedirectHandler()
     rhandler.max_redirections = request['max_redirect']
     opener = build_opener(rhandler)
     if request['username']:
         # Register the credentials for both Basic and Digest authentication.
         passmgr = HTTPPasswordMgrWithDefaultRealm()
         passmgr.add_password(
             None,
             request['url'],
             request['username'],
             request['password'],
         )
         opener.add_handler(HTTPBasicAuthHandler(passmgr))
         opener.add_handler(HTTPDigestAuthHandler(passmgr))
     req = Request(
         url=request['url'],
         data=request['data'],
         headers=request['headers'],
         method=request['method'],
     )
     if request['gzip_decompress']:
         req.add_header('Accept-encoding', 'gzip')
     try:
         res = retry(tries=request['retry'])(opener.open)(
             req, timeout=request['timeout'])
     except HTTPError as e:
         # An HTTPError is used as the response so error statuses are
         # reported to the caller instead of raised.
         res = e
     if not hasattr(res, 'version'):
         # urllib2 does not have 'version' field
         import httplib
         res.version = httplib.HTTPConnection._http_vsn
     response_status = "HTTP/%s %d %s\n" % (
         '1.1' if res.version == 11 else '1.0',
         res.code,
         res.msg,
     )
     response_headers = str(res.headers)
     response_body = res.read()
     if (request['gzip_decompress']
             and res.headers.get('Content-Encoding') == 'gzip'):
         response_body = gzip_decompress(response_body)
     if hasattr(res.headers, 'get_content_charset'):
         # Python 3
         response_encoding = res.headers.get_content_charset()
     else:
         # Python 2
         response_encoding = res.headers.getparam('charset')
     # Responses without a declared charset yield None here, and
     # decode(None) raises TypeError; fall back to UTF-8 in that case.
     response_body = response_body.decode(response_encoding or 'utf-8')
     return (
         request['url'],
         response_status + response_headers,
         response_body,
     )
Example #3
0
 def http_error_302(self, req, fp, code, msg, headers):
     """Follow a 302 redirect and record the URL it was redirected from.

     The stock ``HTTPRedirectHandler`` performs the redirect; the URL of the
     redirected request (``req.url``) is appended to a ``redirected_via``
     list on the resulting response.

     Returns:
         The response produced by the parent handler, or ``None`` when the
         parent handler declined to follow the redirect.
     """
     previous_url = req.url
     result = HTTPRedirectHandler.http_error_302(
         self, req, fp, code, msg, headers)
     if result is None:
         # Nothing to annotate; let the opener fall through to its default.
         return None
     if not hasattr(result, "redirected_via"):
         result.redirected_via = []
     result.redirected_via.append(previous_url)
     # Without returning the response the opener treats the redirect as
     # unhandled.
     return result
Example #4
0
 def http_error_302(self, req, fp, code, msg, headers):
     """Follow a 302 redirect and remember which URL triggered it.

     Delegates the redirect to ``HTTPRedirectHandler`` and appends
     ``req.url`` to the ``redirected_via`` list of the response, creating
     the list when the response does not have one yet.

     Returns:
         The parent handler's response, or ``None`` if it returned nothing.
     """
     previous_url = req.url
     result = HTTPRedirectHandler.http_error_302(
         self, req, fp, code, msg, headers)
     if result is not None:
         if not hasattr(result, "redirected_via"):
             result.redirected_via = []
         result.redirected_via.append(previous_url)
     # The response has to be handed back, otherwise the redirect is lost.
     return result
Example #5
0
 def _handle_redirect(self, req, fp, code, msg, headers):
     """Rewrite the Location header via ``quote_url`` and follow the redirect.

     The header value is passed through ``quote_url`` and written back before
     the stock 302 handling of ``HTTPRedirectHandler`` runs.
     """
     escaped_location = quote_url(headers.get('Location'))
     headers.replace_header('Location', escaped_location)
     return HTTPRedirectHandler.http_error_302(self, req, fp, code, msg,
                                               headers)
 def http_error_302(self, req, fp, code, msg, headers):
     """Either follow the redirect or abort with ``RedirectionException``.

     When ``self.throw`` is set, the Location header is stored on
     ``self.location`` and ``RedirectionException`` is raised; otherwise the
     stock redirect handling is used.
     """
     if not self.throw:
         return HTTPRedirectHandler.http_error_302(self, req, fp, code, msg,
                                                   headers)
     self.location = headers.getheader('Location')
     raise RedirectionException()
Example #7
0
 def http_error_301(self, req, fp, code, msg, headers):
     """Follow a 301 redirect and record the URL it was redirected from.

     The full URL of the redirected request is appended to the
     ``redirected_via`` list of the response returned by the stock handler.

     Returns:
         The parent handler's response, or ``None`` when it returned nothing.
     """
     origin_url = req.get_full_url()
     result = HTTPRedirectHandler.http_error_301(
         self, req, fp, code, msg, headers)
     if result is None:
         # Redirect was not followed; there is no response to annotate.
         return None
     if not hasattr(result, "redirected_via"):
         result.redirected_via = []
     result.redirected_via.append(origin_url)
     # Hand the response back so the opener treats the redirect as handled.
     return result
Example #8
0
 def http_error_301(self, req, fp, code, msg, headers):
     """Follow a 301 redirect and remember which URL triggered it.

     Delegates to ``HTTPRedirectHandler`` and appends the request's full URL
     to the ``redirected_via`` list of the response, creating the list when
     it is missing.

     Returns:
         The parent handler's response, or ``None`` if it returned nothing.
     """
     origin_url = req.get_full_url()
     result = HTTPRedirectHandler.http_error_301(
         self, req, fp, code, msg, headers)
     if result is not None:
         if not hasattr(result, "redirected_via"):
             result.redirected_via = []
         result.redirected_via.append(origin_url)
     # The response has to be handed back, otherwise the redirect is lost.
     return result
Example #9
0
def auth(*, email, password, client_id, scope):
    """Log in through the oauth.vk.com web forms and return the token.

    Args:
        email: Value submitted in the login form's ``email`` field.
        password: Value submitted in the login form's ``pass`` field.
        client_id: Application id placed in the authorize URL.
        scope: A single scope or a list of scopes; joined with commas.

    Returns:
        A tuple ``(access_token, user_id)`` taken from the fragment of the
        final redirect URL.

    Raises:
        RuntimeError: If a form cannot be parsed, the flow does not end on
            ``/blank.html``, or the answer lacks the expected values.
        NotImplementedError: If a parsed form uses a method other than POST.
    """
    def split_key_value(kv_pair):
        # partition() tolerates pairs without "=" (no IndexError) and keeps
        # any further "=" characters inside the value.
        key, _, value = kv_pair.partition("=")
        return key, value

    # Authorization form
    def auth_user(email, password, client_id, scope, opener):
        response = opener.open(
            "http://oauth.vk.com/oauth/authorize?" + \
            "redirect_uri=http://oauth.vk.com/blank.html&response_type=token&" + \
            "client_id=%s&scope=%s" % (client_id, ",".join(scope))
            )
        doc = response.read().decode('utf-8')
        parser = AuthFormParser()
        parser.feed(doc)
        parser.close()
        if not parser.form_parsed or parser.url is None or "pass" not in parser.params or \
          "email" not in parser.params:
            raise RuntimeError("Something wrong")
        parser.params["email"] = email
        parser.params["pass"] = password
        if parser.method == "POST":
            response = opener.open(parser.url,
                                   urlencode(parser.params).encode('ascii'))
        else:
            raise NotImplementedError("Method '%s'" % parser.method)
        return response.read(), response.geturl()

    # Permission request form
    def give_access(doc, opener):
        parser = AuthFormParser()
        parser.feed(doc)
        parser.close()
        if not parser.form_parsed or parser.url is None:
            raise RuntimeError("Something wrong")
        if parser.method == "POST":
            response = opener.open(parser.url,
                                   urlencode(parser.params).encode('ascii'))
        else:
            raise NotImplementedError("Method '%s'" % parser.method)
        return response.geturl()

    if not isinstance(scope, list):
        scope = [scope]
    # The cookie jar carries the session between the form submissions.
    opener = build_opener(HTTPCookieProcessor(http.cookiejar.CookieJar()),
                          HTTPRedirectHandler())
    doc, url = auth_user(email, password, client_id, scope, opener)
    if urlparse(url).path != "/blank.html":
        # Need to give access to requested scope
        url = give_access(doc.decode('utf-8'), opener)
    if urlparse(url).path != "/blank.html":
        raise RuntimeError("Expected success here")
    answer = dict(
        split_key_value(kv_pair)
        for kv_pair in urlparse(url).fragment.split("&"))
    if "access_token" not in answer or "user_id" not in answer:
        raise RuntimeError("Missing some values in answer")

    return answer["access_token"], answer["user_id"]
Example #10
0
 def setup_method(self, method):
     """Prepare a cookie-aware opener and start the application process."""
     self.cookies = CookieJar()
     handler_chain = [
         HTTPRedirectHandler(),
         HTTPHandler(debuglevel=0),
         HTTPSHandler(debuglevel=0),
         HTTPCookieProcessor(self.cookies),
     ]
     self.opener = build_opener(*handler_chain)
     # Run ``main`` in a separate process for the duration of the test.
     app_process = Process(target=main)
     self.application_process = app_process
     app_process.start()
Example #11
0
 def http_error_302(self, req, fp, code, msg, headers):
     """Refuse redirects to ``mms:`` URLs, otherwise follow them normally.

     Raises:
         URLError: When the redirect target starts with ``mms:``.
     """
     if 'location' in headers and headers['location'].startswith('mms:'):
         raise URLError("MMS REDIRECT:" + headers["Location"])
     return HTTPRedirectHandler.http_error_302(self, req, fp, code, msg,
                                               headers)
Example #12
0
 def redirect_request(self, req, fp, code, msg, headers, newurl):
     """Build the redirected request without the Authorization header.

     The parent implementation creates the follow-up request; any
     ``Authorization`` header it carries is removed before it is returned.
     """
     if PY2:
         # HTTPRedirectHandler is an old style class
         request = HTTPRedirectHandler.redirect_request(self, req, fp, code, msg, headers, newurl)
     else:
         request = super(S3HTTPRedirectHandler, self).redirect_request(req, fp, code, msg, headers, newurl)
     # pop() rather than del: a request sent without credentials has no such
     # header, and deleting a missing key would raise KeyError.
     request.headers.pop('Authorization', None)
     return request
Example #13
0
 def redirect_request(self, request, fp, code, msg, headers, new_url):
     """Build the follow-up request for a redirect, keeping the body.

     When the status code (as a string) is listed in ``self.redirect_codes``
     and the request method is in ``self.valid_methods``, a new request to
     ``new_url`` is created that carries over the data and headers, and whose
     method is forced to ``self.method`` if that is a valid method. Every
     other case is delegated to ``HTTPRedirectHandler``.

     Returns:
         The request to issue next, as produced here or by the parent class.
     """
     if (str(code) not in self.redirect_codes
             or request.get_method() not in self.valid_methods):
         # The parent call needs ``self`` and its result must be returned,
         # otherwise the redirect is silently dropped.
         return HTTPRedirectHandler.redirect_request(
             self, request, fp, code, msg, headers, new_url)

     new_url = new_url.replace(' ', '%20')
     # ``origin_req_host`` is a plain attribute; the getter method was
     # removed from Request in Python 3.4.
     redirected = Request(new_url,
                          data=request.data,
                          headers=request.headers,
                          origin_req_host=request.origin_req_host,
                          unverifiable=True)
     if self.method in self.valid_methods:
         if redirected.get_method() != self.method:
             redirected.get_method = lambda: self.method
     return redirected
Example #14
0
    def download(self, url, error_message, timeout, tries):
        """Fetch ``url`` and return the response body, retrying on some errors.

        Args:
            url: URL to download.
            error_message: Prefix used in the warnings logged on failure.
            timeout: Timeout passed to the opener.
            tries: Maximum number of attempts.

        Returns:
            The body as returned by ``read()``, or ``False`` when the
            download failed or no attempt was made.
        """
        http_proxy = self.setting.http_proxy
        https_proxy = self.setting.https_proxy
        if http_proxy or https_proxy:
            proxies = {}
            if http_proxy:
                proxies['http'] = http_proxy
                # Reuse the HTTP proxy for HTTPS unless one is configured.
                if not https_proxy:
                    proxies['https'] = http_proxy
            if https_proxy:
                proxies['https'] = https_proxy
            proxy_handler = ProxyHandler(proxies)
        else:
            proxy_handler = ProxyHandler()
        handlers = [proxy_handler, HTTPRedirectHandler()]

        # secure_url_match = re.match('^https://([^/]+)', url)
        # if secure_url_match != None:
        #   secure_domain = secure_url_match.group(1)
        #   bundle_path = self.check_certs(secure_domain, timeout)
        #   if not bundle_path:
        #       return False
        #   handlers.append(VerifiedHTTPSHandler(ca_certs=bundle_path))
        opener = build_opener(*handlers)

        while tries > 0:
            tries -= 1
            try:
                request = Request(
                    url, headers={"User-Agent": "OmniMarkup Downloader"})
                http_file = opener.open(request, timeout=timeout)
                try:
                    return http_file.read()
                finally:
                    # Release the connection whether or not read() succeeded.
                    http_file.close()

            except HTTPException as e:
                log.warning('%s HTTP exception %s (%s) downloading %s.',
                            error_message, e.__class__.__name__, str(e), url)

            except HTTPError as e:
                # Bitbucket and Github ratelimit using 503 a decent amount
                if str(e.code) == '503':
                    log.warning(
                        'Downloading %s was rate limited, trying again', url)
                    continue
                log.warning('%s HTTP error %s downloading %s.', error_message,
                            str(e.code), url)

            except URLError as e:
                # Bitbucket and Github timeout a decent amount
                if str(e.reason) == 'The read operation timed out' or \
                        str(e.reason) == 'timed out':
                    log.warning('Downloading %s timed out, trying again', url)
                    continue
                log.warning('%s URL error %s downloading %s.', error_message,
                            str(e.reason), url)
            # Any error that is not retried ends the loop.
            break
        return False
Example #15
0
    def http_error_302(self, req, res, code, msg, headers):
        '''Follow the redirect for GET requests only.

        For any other request method the response ``res`` is returned
        unchanged; GET requests go through the parent implementation.
        '''
        if req.get_method() == 'GET':
            return HTTPRedirectHandler.http_error_302(self, req, res, code,
                                                      msg, headers)
        return res
Example #16
0
    def __init__(self, data_path, **kwargs):
        """Set up the cookie jar and opener for ``data_path``.

        Raises:
            Exception: If ``validation.is_data_path`` rejects ``data_path``.
        """
        if not validation.is_data_path(data_path):
            raise Exception('invalid data_path: %s' % data_path)

        cookie_file = os.path.join(data_path, default.COOKIES_FILENAME)
        self.cookie_jar = MozillaCookieJar(cookie_file)
        try:
            self.cookie_jar.load()
        except EnvironmentError:
            # Loading is best effort; start with an empty jar on failure.
            pass

        redirect_handler = HTTPRedirectHandler()
        cookie_handler = HTTPCookieProcessor(self.cookie_jar)
        self.opener = build_opener(redirect_handler, cookie_handler)

        super(Session, self).__init__(**kwargs)
Example #17
0
    def redirect_request(self, req, res, code, msg, hdrs, newurl):
        """Record details of the redirect, then build the follow-up request.

        A dict with the original URL, the response headers, the status code,
        the message and the new URL is appended to ``self.redirect_hdrs``
        before the parent implementation creates the new request.
        """
        self.redirect_hdrs.append(dict(
            url=req.get_full_url(),
            headers=res.headers,
            code=code,
            msg=msg,
            new_url=newurl,
        ))
        return HTTPRedirectHandler.redirect_request(self, req, res, code, msg,
                                                    hdrs, newurl)
Example #18
0
    def http_error_302(self, request, fp, code, message, headers):
        """Merge request and response cookies, then wrap the redirect result.

        The request's ``Cookie`` header and the response's ``set-cookie``
        values are loaded into one ``SimpleCookie`` whose rendering is stored
        in ``headers['Cookie']``. The parent handler's result is returned
        wrapped in ``inesHTTPError``.
        """
        jar = SimpleCookie()

        sent_cookie = request.headers.get('Cookie')
        if sent_cookie:
            jar.load(sent_cookie)

        # NOTE(review): this iterates the value of ``headers.get``; if that
        # is a plain string the loop runs per character - confirm that
        # ``headers`` yields a list of values here.
        for value in headers.get('set-cookie') or ():
            jar.load(value)

        headers['Cookie'] = jar.output(header='', sep='; ')

        followed = HTTPRedirectHandler.http_error_302(
            self, request, fp, code, message, headers)
        return inesHTTPError(request, followed, code, message, headers)
Example #19
0
    def redirect_request(self, req, fp, code, msg, headers, newurl):
        """Build the redirected request, dropping credentials on host change.

        The ``Authorization`` header is removed from the new request when the
        original request carried one and the redirect points at a different
        hostname.
        """
        redirected = HTTPRedirectHandler.redirect_request(
            self, req, fp, code, msg, headers, newurl)

        if 'Authorization' in req.headers:
            origin_host = urlparse(req.get_full_url()).hostname
            target_host = urlparse(redirected.get_full_url()).hostname

            if target_host != origin_host:
                bot.debug('AuthRedirectHandler: stripping "Authorization" header '
                          "(%s != %s)" % (target_host, origin_host))
                del redirected.headers['Authorization']

        return redirected
Example #20
0
    def http_error_302(self, req, res, code, msg, headers):
        '''Redirect GET requests via the parent; pass others through.

        A request whose method is not GET gets its response ``res`` back
        untouched.
        '''
        is_get = req.get_method() == 'GET'
        if not is_get:
            return res

        parent_handler = HTTPRedirectHandler.http_error_302
        return parent_handler(self, req, res, code, msg, headers)
Example #21
0
 def login(self):
     """Log in to GeoNetwork and keep the cookie-aware opener on ``self``.

     Does nothing unless ``self.type`` is ``'geonetwork'``. On success
     ``self.connected`` is set to True.
     """
     if self.type != 'geonetwork':
         return
     login_url = "%sgeonetwork/srv/en/xml.user.login" % self.base
     credentials = urlencode({
         "username": self.user,
         "password": self.password
     })
     request = Request(login_url, credentials, {
         "Content-Type": "application/x-www-form-urlencoded",
         "Accept": "text/plain"
     })
     # The cookie processor keeps the session for later requests.
     self.opener = build_opener(HTTPCookieProcessor(),
                                HTTPRedirectHandler())
     reply = self.opener.open(request)
     doc = dlxml.fromstring(reply.read())
     assert doc.tag == 'ok', "GeoNetwork login failed!"
     self.connected = True
Example #22
0
 def __init__(self, proxy=None):
     """Create the openers, optionally routing traffic through a proxy.

     ``self.rawopen`` never uses a proxy. ``self.opener`` is the same object
     when ``proxy`` is None or ``self.no_proxy`` is set; otherwise it is
     built with a ``ProxyHandler``. ``proxy == 'auto'`` selects the built-in
     proxy address, any other value is passed to ``ProxyHandler`` as is.
     """
     self.redirh = HTTPRedirectHandler()
     self.cookie = HTTPCookieProcessor()
     self.rawopen = build_opener(self.redirh, self.cookie)
     if proxy is None or self.no_proxy:
         self.opener = self.rawopen
     else:
         if proxy == 'auto':
             proxy_map = {'http': "https://secure.uku.im:8443"}
         else:
             proxy_map = proxy
         self.proxyh = ProxyHandler(proxy_map)
         self.opener = build_opener(self.proxyh, self.redirh, self.cookie)
     self.extra_headers = {"User-Agent": USER_AGENT}
Example #23
0
    def http_error_301(self, req, res, code, msg, headers):
        '''Store the new location on the resource; redirect GET requests.

        ``req.resource.location`` is updated from the ``location`` header
        when present. Only GET requests are then redirected through the
        parent implementation; for other methods ``res`` is returned.
        '''
        verb = req.get_method()
        target = req.resource

        if 'location' in headers:
            target.location = headers['location']

        if verb == 'GET':
            return HTTPRedirectHandler.http_error_301(self, req, res, code,
                                                      msg, headers)
        return res
Example #24
0
    def __init__(self, args):
        """Store ``args``, load saved cookies and build the opener."""
        self.args = args
        self.cj = http.cookiejar.MozillaCookieJar(COOKIES_FILENAME)
        cookies_present = os.access(COOKIES_FILENAME, os.F_OK)
        if cookies_present:
            self.cj.load(os.getcwd() + "/" + COOKIES_FILENAME)

        handler_chain = (
            HTTPRedirectHandler(),
            HTTPHandler(debuglevel=0),
            HTTPSHandler(debuglevel=0),
            HTTPCookieProcessor(self.cj),
        )
        self.opener = build_opener(*handler_chain)

        # Default headers sent with every request made through the opener.
        user_agent = 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.124 Safari/537.36'
        accept = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
        self.opener.addheaders = [
            ('User-Agent', user_agent),
            ('Accept', accept),
        ]

        # Make sure the temporary directory exists.
        if not os.path.exists(TMP_DIR):
            os.makedirs(TMP_DIR)
Example #25
0
    def __init__(self):
        """Store credentials and URLs, build the opener and log in once."""
        self.user = bugtracker_user
        self.password = bugtracker_pass
        self.login_page = 'https://bugs.archlinux.org/index.php?do=authenticate'
        self.target_page = 'https://bugs.archlinux.org/index.php?events%5B%5D=1&events%5B%5D=13&events%5B%5D=2&events%5B%5D=4&event_number=50&do=reports&project=0'
        self.cj = CookieJar()
        handler_chain = [
            HTTPRedirectHandler(),
            HTTPHandler(debuglevel=0),
            HTTPSHandler(debuglevel=0),
            HTTPCookieProcessor(self.cj),
        ]
        self.opener = build_opener(*handler_chain)
        self.opener.addheaders = [('User-agent', 'Mozilla/5.0')]

        # An earlier version logged in twice (once to set cookies, once to
        # log in); only a single login is performed now.
        self.login()
        self.old_events = set()
Example #26
0
    def http_error_301(self, req, res, code, msg, headers):
        '''Remember the moved location and redirect GET requests only.

        The ``location`` header, when present, is copied to
        ``req.resource.location``. Non-GET requests get ``res`` back
        unchanged; GET requests are handled by the parent implementation.
        '''
        verb = req.get_method()
        moved_resource = req.resource

        if 'location' in headers:
            moved_resource.location = headers['location']

        if verb != 'GET':
            return res

        parent_handler = HTTPRedirectHandler.http_error_301
        return parent_handler(self, req, res, code, msg, headers)
Example #27
0
class Page(object):
    """Fetch pages through a shared opener and parse them with BeautifulSoup.

    The opener is built once, at class creation time, and shared by all
    instances. HTTP debug output is enabled when the root logger's effective
    level is DEBUG at that moment.
    """
    verb_handler = HTTPHandler()
    if logging.getLogger().getEffectiveLevel() == logging.DEBUG:
        verb_handler.set_http_debuglevel(2)
    redir_handler = HTTPRedirectHandler()
    opener = build_opener(verb_handler, redir_handler)

    def __init__(self):
        pass

    @staticmethod
    def unenscape_Google_bang_URL(old_URL):
        """Rewrite a Google "#!" or Groups topic URL to its crawlable form.

        See https://developers.google.com/webmasters/ajax-crawling/docs/getting-started
        for more information. URLs matching neither pattern are returned
        unchanged.
        """
        if '#!' in old_URL:
            return old_URL.replace('#!', '?_escaped_fragment_=')
        if old_URL.startswith('https://groups.google.com/d/topic/'):
            # Example:
            #   https://groups.google.com/d/topic/jbrout/dreCkob3KSs
            # becomes
            #   https://groups.google.com/forum/
            #       ?_escaped_fragment_=topic/jbrout/dreCkob3KSs
            return old_URL.replace(
                'https://groups.google.com/d/',
                'https://groups.google.com/forum/?_escaped_fragment_=')
        return old_URL

    def _get_page_BS(self, URL):
        """Download ``URL`` (after rewriting) and return it as BeautifulSoup."""
        res = self.opener.open(self.unenscape_Google_bang_URL(URL))
        try:
            in_str = res.read()
        finally:
            # Close the response even when reading fails.
            res.close()
        return BeautifulSoup(in_str)
Example #28
0
 def http_error_303(self, req, res, code, msg, hdrs):
     """Handle a 303 response by deferring entirely to the stock handler."""
     parent_handler = HTTPRedirectHandler.http_error_303
     return parent_handler(self, req, res, code, msg, hdrs)
Example #29
0
        request.add_unredirected_header('Authorization',
                                        'Bearer ' + auth_token)
        return self.parent.open(request, timeout=request.timeout)


# Got some help from this example https://gist.github.com/FiloSottile/2077115
class HeadRequest(Request):
    """A ``Request`` whose HTTP method is always HEAD."""

    def get_method(self):
        """Return the fixed method name used for this request."""
        verb = "HEAD"
        return verb


# Opener assembled by hand so that only the handlers listed here are active.
better_urllib_get = OpenerDirector()
better_urllib_get.addheaders = DEFAULT_HEADERS.copy()
# Handlers are instantiated and registered one at a time, in this order.
for _handler_factory in (HTTPHandler, HTTPSHandler, HTTPRedirectHandler,
                         SocketFileHandler, Oauth2TokenAuthHandler):
    better_urllib_get.add_handler(_handler_factory())
del _handler_factory


class RegistryError(Exception):
    """Exception that carries the response object it was created with."""

    def __init__(self, response):
        # Exception already records ``response`` in ``args`` on creation;
        # the explicit call keeps that visible here.
        super(RegistryError, self).__init__(response)
        self.response_obj = response


# Util functions
#############################################################################################
def parse_thresholds(spec, include_units=True, units_required=True):
    """
    Given a spec string break it up into ':' separated chunks. Convert strings to ints as it makes sense
Example #30
0
 def redirect_request(self, req, fp, code, msg, headers, newurl):
     """Build the follow-up request and note the status code on ``req``.

     The redirect status ``code`` is stored as ``req.redirect_code`` once the
     parent implementation has produced the new request.
     """
     follow_up = HTTPRedirectHandler.redirect_request(
         self, req, fp, code, msg, headers, newurl)
     req.redirect_code = code
     return follow_up
Example #31
0
import pickle
import time
import requests

from celerycrawler import settings
from datetime import datetime
from urllib.parse import urlparse
from urllib.robotparser import RobotFileParser
from urllib.request import urlopen, Request, HTTPError
from urllib.request import install_opener, build_opener, HTTPRedirectHandler
from couchdb.mapping import Document, TextField, DateTimeField, ListField, FloatField
from django.core.cache import cache

# Install a process-wide default opener (the one urlopen() uses) that is
# built with an explicit HTTPRedirectHandler.
install_opener(build_opener(HTTPRedirectHandler()))

class Page(Document):
    type = TextField(default="page")
    url = TextField()
    raw = TextField()
    content = TextField()
    links = ListField(TextField())
    rank = FloatField(default=0)
    last_checked = DateTimeField(default=datetime.now)

    def is_valid(self):
        """Return True while fewer than 7 whole days passed since last check."""
        age = datetime.now() - self.last_checked
        return age.days < 7

    def update(self):
        print("updating page")
        
        parse = urlparse(self.url)
Example #32
0
 def redirect_request(self, req, fp, code, msg, headers, newurl):
     """Create the redirected request and tag ``req`` with the status code.

     After the parent implementation returns, ``req.redirect_code`` holds the
     redirect status that was handled.
     """
     redirected = HTTPRedirectHandler.redirect_request(
         self, req, fp, code, msg, headers, newurl)
     req.redirect_code = code
     return redirected
Example #33
0
 def _handle_redirect(self, req, fp, code, msg, headers):
     """Pass the Location header through ``quote_url`` and follow it.

     The rewritten value replaces the header before the stock 302 handling
     of ``HTTPRedirectHandler`` is invoked.
     """
     raw_location = headers.get('Location')
     headers.replace_header('Location', quote_url(raw_location))
     return HTTPRedirectHandler.http_error_302(self, req, fp, code, msg, headers)
Example #34
0
 def http_error_307(self, req, res, code, msg, hdrs):
     """Follow only the first 307 redirect seen by this handler.

     Every call increments ``self.counter['307']``; from the second call on
     ``None`` is returned instead of following the redirect.
     """
     self.counter['307'] += 1
     if self.counter['307'] > 1:
         return None
     return HTTPRedirectHandler.http_error_307(self, req, res, code,
                                               msg, hdrs)
 def redirect_request(self, req, fp, code, msg, hdrs, newurl):
     """Remember the redirect target and force the new request to use HEAD."""
     self.last_url = newurl

     def _always_head():
         return 'HEAD'

     head_request = HTTPRedirectHandler.redirect_request(
         self, req, fp, code, msg, hdrs, newurl)
     head_request.get_method = _always_head
     return head_request
Example #36
0
 def __init__(self):
     """Initialise the base handler and the per-instance bookkeeping."""
     HTTPRedirectHandler.__init__(self)
     # Per-status-code call counts.
     self.counter = collections.Counter()
     # Starts empty; one entry is expected per recorded redirect.
     self.redirect_hdrs = list()
Example #37
0
 def http_error_302(self, req, fp, code, msg, headers):
     """Follow the redirect and expose the redirect status on the response.

     Returns:
         The parent handler's response with ``status`` set to ``code``, or
         ``None`` when the parent did not follow the redirect (for instance
         because the response carries no Location header).
     """
     result = HTTPRedirectHandler.http_error_302(self, req, fp, code, msg, headers)
     if result is not None:
         # Only annotate a real response; assigning on None would raise.
         result.status = code
     return result
 def http_error_302(self, req, fp, code, msg, headers):
     """Follow the redirect and record its status code on the response.

     Returns:
         The response from the parent handler with ``status`` set to
         ``code``; ``None`` is passed through unchanged when the parent
         handler returned nothing.
     """
     result = HTTPRedirectHandler.http_error_302(self, req, fp, code, msg, headers)
     if result is None:
         # No response to annotate, e.g. a redirect without Location header.
         return None
     result.status = code
     return result
Example #39
0
 def redirect_request(self, req, *rest):
     """Build the redirected request and carry ``req.resource`` over to it."""
     carried = req.resource
     follow_up = HTTPRedirectHandler.redirect_request(self, req, *rest)
     follow_up.resource = carried
     return follow_up