Example #1
0
def bookmarklet_follow(request):
    """Fetch the bookmarked page, process it and redirect to the new topic.

    Reads the page URI (``u``) and title (``t``) from the query string,
    downloads the page, wraps it in a ``DiscoveredResource``, sends it to
    the processing service and creates a topic from the processed
    resource's title, terms and entities.

    Returns a redirect to the ``topics_show`` view of the created topic.
    A missing ``u`` or ``t`` parameter raises the lookup error of
    ``request.GET``; download errors propagate from the opener.
    """
    uri = request.GET["u"]
    title = request.GET["t"]

    # url opener with the configured redirect limit
    redirect_handler = HTTPRedirectHandler()
    redirect_handler.max_redirections = settings.CONFIG["url"]["max_redirections"]
    opener = urllib2.build_opener(redirect_handler)

    # web page loading
    # NOTE(review): `uri` comes straight from the query string and is
    # fetched server-side; consider restricting schemes/hosts.
    handle = opener.open(uri)
    try:
        encoding = detect_header_encoding(handle.headers.dict)
        content = decode_html(handle.read(), encoding)
    finally:
        # Always release the connection, even if reading/decoding fails.
        handle.close()

    # build a resource
    discovered_resource = DiscoveredResource()
    discovered_resource.uri = uri
    discovered_resource.title = title
    discovered_resource.content = content

    # process the discovered resource
    resource = processing_service_client.process(discovered_resource)

    topic = topic_manager.create_from_features(resource.title, resource.terms, resource.entities)

    return redirect("topic_tracking_web.demo.views.topics_show", topic._id)
Example #2
0
    def build_opener(self, debug=False):
        """Create an opener whose handlers share the given debug level.

        We intentionally create new handlers because the OpenerDirector
        class in urllib2 is smart enough to replace its internal
        versions with ours if we pass them into the
        urllib2.build_opener method.  This is much easier than
        trying to introspect into the OpenerDirector to find the
        existing handlers.
        Based on http://code.activestate.com/recipes/440574/#c1

        ``debug`` is forwarded to every handler as ``debuglevel``.
        Returns the opener produced by ``build_opener``.

        NOTE(review): every handler class here is called with
        ``debuglevel``; the stock urllib2 ProxyHandler, UnknownHandler,
        HTTPDefaultErrorHandler, HTTPRedirectHandler and
        HTTPErrorProcessor do not take that argument, so presumably
        these names are project subclasses - confirm.

        TODO: Implement workaround for http://bugs.python.org/issue7152
        """
        handler_classes = (
            HTTPHandler,
            HTTPSHandler,
            ProxyHandler,
            UnknownHandler,
            HTTPDefaultErrorHandler,
            HTTPRedirectHandler,
            HTTPErrorProcessor,
        )
        handlers = [cls(debuglevel=debug) for cls in handler_classes]

        # build_opener takes handlers as separate positional arguments;
        # passing the list itself would be treated as a single (invalid)
        # handler, so unpack it.
        return build_opener(*handlers)
Example #3
0
    def redirect_request(self, req, fp, code, msg, headers, newurl):
        """Build the follow-up request for a redirect, keeping the payload.

        See `urllib2.HTTPRedirectHandler`.

        The payload (``data``) and ``timeout`` of the original request
        are carried over to the returned request, so a redirected POST
        stays a POST with the same body.
        """
        # HTTPRedirectHandler is not a new-style class, hence the
        # explicit unbound call instead of super(). It raises HTTPError
        # itself when the redirect cannot be handled.
        followup = HTTPRedirectHandler.redirect_request(
            self, req, fp, code, msg, headers, newurl)

        # No POST/GET check is needed: a payload of None means GET and
        # anything else means POST, so copying it verbatim keeps the
        # request kind unchanged.
        for attr in ('data', 'timeout'):
            setattr(followup, attr, getattr(req, attr))
        return followup
Example #4
0
def build_opener(source_address=None, timeout=10):
    """Assemble an ``OpenerDirector`` from an explicit handler list.

    Works like ``urllib2.build_opener`` but installs only the handlers
    listed here, passes ``source_address`` and ``timeout`` to the
    Speedtest HTTP/HTTPS handlers, and sets our custom `User-Agent`.
    """
    # A truthy source address is turned into a (address, 0) tuple for
    # the handlers; otherwise no source address is passed.
    bind_address = (source_address, 0) if source_address else None

    handler_chain = [
        ProxyHandler(),
        SpeedtestHTTPHandler(source_address=bind_address, timeout=timeout),
        SpeedtestHTTPSHandler(source_address=bind_address, timeout=timeout),
        HTTPDefaultErrorHandler(),
        HTTPRedirectHandler(),
        HTTPErrorProcessor(),
    ]

    director = OpenerDirector()
    director.addheaders = [('User-agent', build_user_agent())]

    for current in handler_chain:
        director.add_handler(current)

    return director
Example #5
0
    def redirect_request(self, req, fp, code, msg, headers, newurl):
        """Create the redirected request and copy payload and timeout.

        See `urllib2.HTTPRedirectHandler`.

        A POST request stays a POST request after the redirect because
        its payload is copied onto the request that is returned.
        """
        # Old-style base class: call the parent implementation directly.
        # It raises HTTPError when this redirect cannot be followed.
        base_redirect = HTTPRedirectHandler.redirect_request
        redirected = base_redirect(self, req, fp, code, msg, headers, newurl)

        # The payload alone decides POST (not None) versus GET (None),
        # so it is copied unconditionally.
        payload = req.data
        redirected.data = payload
        redirected.timeout = req.timeout
        return redirected
Example #6
0
 def http_error_301(self, req, fp, code, msg, headers):
     """Record the permanent redirect target, then follow the redirect.

     The first of the ``location`` / ``uri`` headers that is present is
     passed to ``__set_permanent`` together with the request; the actual
     redirect handling is left to ``HTTPRedirectHandler``.
     """
     for header_name in ('location', 'uri'):
         if headers.has_key(header_name):
             self.__set_permanent(req, headers[header_name])
             break
     return HTTPRedirectHandler.http_error_301(
         self, req, fp, code, msg, headers)
Example #7
0
 def redirect_request(self, req, fp, code, msg, hdrs, newurl):
     """Follow a redirect unless the redirect budget is used up.

     Every call increments ``self._counter``. Once it exceeds
     ``self._maxRedirects`` an ``HTTPError`` is raised for the original
     URL; otherwise the stock ``HTTPRedirectHandler`` builds the new
     request.
     """
     self._counter += 1
     limit_exceeded = self._counter > self._maxRedirects
     if limit_exceeded:
         raise HTTPError(
             req.get_full_url(),
             code,
             'Reached the maximum number of redirects',
             hdrs,
             fp,
         )

     # TODO: really reuse referer-header?
     return HTTPRedirectHandler.redirect_request(
         self, req, fp, code, msg, hdrs, newurl)
Example #8
0
    def http_error_302(self, req, fp, code, msg, headers):
        """Log the redirect target, then let the base handler follow it.

        The target is taken from the first ``location`` header or,
        failing that, the first ``uri`` header, and resolved against the
        request URL before logging. When neither header is present
        nothing is logged (previously this path hit an unbound
        ``newurl``) and the decision is left to the base handler.

        Returns whatever ``HTTPRedirectHandler.http_error_302`` returns.
        """
        newurl = None
        for header_name in ('location', 'uri'):
            if header_name in headers:
                newurl = headers.getheaders(header_name)[0]
                break

        if newurl is not None:
            newurl = urljoin(req.get_full_url(), newurl)
            log.debug("302 %s" % newurl)

        return HTTPRedirectHandler.http_error_302(
            self, req, fp, code, msg, headers)
Example #9
0
    def open(self, url, start, headers=None):
        """Open ``url`` within what is left of the overall time budget.

        ``start`` is the ``time()`` value at which the overall operation
        began. If ``TIMEOUT`` seconds or more have already elapsed a
        ``TimeoutError`` carrying the elapsed seconds is raised.
        Otherwise the URL is requested with a fixed Firefox User-Agent
        (overriding any supplied one), using a fresh opener with
        redirect and cookie handling, and a socket timeout of the
        smaller of ``CONNECTION_TIMEOUT`` and the remaining budget.

        ``headers`` is an optional mapping of extra request headers; it
        is copied, so the caller's object is never modified.

        Returns the response object from the opener.
        """
        time_spent = int(time() - start)
        if time_spent >= TIMEOUT:
            raise TimeoutError(time_spent)

        # Work on a copy: updating the argument in place would leak the
        # User-Agent into the caller's dict.
        request_headers = dict(headers) if headers else {}
        request_headers.update({
            'User-Agent':
            'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; ' +
            'rv:1.9.2.13) Gecko/20101203 Firefox/3.6.13'
        })

        opener = build_opener(HTTPRedirectHandler(), HTTPCookieProcessor())
        return opener.open(Request(url, headers=request_headers),
                           timeout=min(CONNECTION_TIMEOUT,
                                       TIMEOUT - time_spent))
Example #10
0
 def redirect_request(self, req, fp, code, msg, hdrs, newurl):
     """Follow redirects except those pointing back at the login URL.

     A redirect whose target starts with the login URL for ``provider``
     is turned into an ``HTTPError`` for the original request; any other
     redirect is handled by the stock ``HTTPRedirectHandler``.
     """
     login_prefix = 'https://localhost.admin.eutaxia.eu:5000/login/%s' % provider
     if not newurl.startswith(login_prefix):
         return HTTPRedirectHandler.redirect_request(
             self, req, fp, code, msg, hdrs, newurl)
     raise HTTPError(req.get_full_url(), code, msg, hdrs, fp)
Example #11
0
 def http_error_302(self, req, fp, code, msg, headers):
     """Follow the redirect and tag the response with the redirect code.

     Delegates to ``HTTPRedirectHandler.http_error_302`` and stores
     ``code`` on the returned response as ``status``.

     The base handler can return None (for example when the response
     has no Location/URI header); in that case None is passed through
     untouched instead of failing on the attribute assignment.
     """
     result = HTTPRedirectHandler.http_error_302(
         self, req, fp, code, msg, headers)
     if result is not None:
         result.status = code
     return result
Example #12
0
 def http_error_301(self, req, fp, code, msg, headers):
     """Follow the permanent redirect and record its code on the response.

     Delegates to ``HTTPRedirectHandler.http_error_301`` and stores
     ``code`` on the returned response as ``status``. A None result
     from the base handler (redirect not followed) is returned as is.
     """
     response = HTTPRedirectHandler.http_error_301(
         self, req, fp, code, msg, headers)
     if response is None:
         # Nothing to annotate; the base handler did not follow a redirect.
         return None
     response.status = code
     return response
Example #13
0
 def http_error_302(self, req, fp, code, msg, headers):
     """Follow the redirect and record the redirect code on the response.

     Delegates to ``HTTPRedirectHandler.http_error_302`` and stores
     ``code`` on the returned response as ``status``. If the base
     handler returns None (redirect not followed), None is returned.
     """
     # The original assigned to `results` but returned `result`, which
     # raised NameError on every call; use a single name throughout.
     result = HTTPRedirectHandler.http_error_302(self, req, fp, code,
                                                 msg, headers)
     if result is not None:
         result.status = code
     return result
Example #14
0
 def http_error_302(self, req, fp, code, msg, headers):
     """Let the base handler follow the redirect, then set ``status``.

     The response returned by ``HTTPRedirectHandler.http_error_302``
     gets ``code`` stored as its ``status`` attribute. When the base
     handler returns None because it did not follow a redirect, that
     None is returned unchanged rather than raising AttributeError.
     """
     followed = HTTPRedirectHandler.http_error_302(self, req, fp, code, msg,
                                                   headers)
     if followed is None:
         return followed
     followed.status = code
     return followed
Example #15
0
 def http_error_302(self, req, fp, code, msg, headers):
     """Handle a 302 by deferring entirely to ``HTTPRedirectHandler``."""
     base_handler = HTTPRedirectHandler.http_error_302
     return base_handler(self, req, fp, code, msg, headers)
    def redirect_request(self, *args):
        """Build the redirected request and attach cookies from the jar.

        All arguments are forwarded unchanged to
        ``HTTPRedirectHandler.redirect_request``; the request it returns
        is passed to ``self.cookie_jar.add_cookie_header`` before being
        handed back.
        """
        redirected = HTTPRedirectHandler.redirect_request(self, *args)
        # The new request needs the cookie header from our cookie jar.
        self.cookie_jar.add_cookie_header(redirected)
        return redirected
            pass

        # remove resource from collection
        self._resources_collection.remove_model(resource)


if __name__ == '__main__':
    # Script entry point: read the YAML configuration named on the
    # command line, set up logging, MongoDB and the URL opener, then
    # start the web page loader.

    # configuration file
    if len(sys.argv) < 2:
        sys.exit('usage: %s CONFIG_FILE' % sys.argv[0])
    config_file = sys.argv[1]
    # NOTE(review): yaml.load can construct arbitrary Python objects;
    # if the configuration file is not fully trusted, switch to
    # yaml.safe_load.
    with open(config_file, 'r') as config_stream:
        # The context manager closes the file once it has been parsed.
        config = yaml.load(config_stream)

    # logging
    logging.config.dictConfig(config['logging'])
    logger = logging.getLogger()

    # MongoDB
    mcm = mongo_from_config(config['mongo'])
    database = config['mongo']['databases']['discovery']
    resources_collection = mcm.get_collection(database, 'resources', DiscoveredResource)

    # url opener with the configured redirect limit
    redirect_handler = HTTPRedirectHandler()
    redirect_handler.max_redirections = config['url']['max_redirections']
    opener = urllib2.build_opener(redirect_handler)

    # load pages to the queue
    loader = WebPageLoader(resources_collection, opener, config)
    loader.start()

Example #18
0
 def http_error_302(self, req, fp, code, msg, headers):
     """Handle a 302 error.

     Delegates to ``HTTPRedirectHandler.http_error_302`` and stores
     ``code`` on the returned response as ``status``. The base handler
     may return None when it does not follow a redirect (for example
     when no Location/URI header is present); None is then returned
     unchanged instead of raising AttributeError.
     """
     result = HTTPRedirectHandler.http_error_302(
         self, req, fp, code, msg, headers)
     if result is not None:
         result.status = code
     return result
 def http_error_302(self, req, fp, code, msg, headers):
     """Follow a 302 redirect and set ``status`` on the response.

     Delegates to ``HTTPRedirectHandler.http_error_302`` and stores
     ``code`` as ``status`` on what it returns. A None result (redirect
     not followed by the base handler) is returned unchanged.
     """
     # Fixes a NameError: the value was bound to `results` but the
     # function returned the undefined name `result`.
     result = HTTPRedirectHandler.http_error_302(self, req, fp, code, msg,
                                                 headers)
     if result is not None:
         result.status = code
     return result
Example #20
0
 def redirect_request(self, req, fp, code, msg, headers, newurl):
     """Build the redirected request and note the code on the old one.

     The follow-up request comes from the stock ``HTTPRedirectHandler``.
     The redirect ``code`` is stored as ``redirect_code`` on the
     original request ``req`` (not on the returned request), and only
     after the base call has succeeded.
     """
     followup = HTTPRedirectHandler.redirect_request(
         self, req, fp, code, msg, headers, newurl)
     setattr(req, 'redirect_code', code)
     return followup
Example #21
0
import base64
from datetime import datetime
import pickle
from robotparser import RobotFileParser
import time
from urlparse import urlparse
from urllib2 import urlopen, Request, HTTPError, install_opener, build_opener, HTTPRedirectHandler

from django.core.cache import cache

from couchdb.mapping import Document, TextField, DateTimeField, ListField, FloatField

import settings

# Install a process-wide default opener that includes a redirect handler.
# build_opener() accepts handler classes as well as instances and
# instantiates the classes itself.
install_opener(build_opener(HTTPRedirectHandler))


class Page(Document):
    type = TextField(default="page")

    url = TextField()

    content = TextField()

    links = ListField(TextField())

    rank = FloatField(default=0)

    last_checked = DateTimeField(default=datetime.now)

    def is_valid(self):
Example #22
0
 def http_error_302(self, req, fp, code, msg, headers):
     """Handle a 302 error.

     The redirect itself is followed by
     ``HTTPRedirectHandler.http_error_302``; the response it returns is
     tagged with ``code`` as ``status``. If the base handler returns
     None (it did not follow a redirect), None is passed through
     instead of failing on the attribute assignment.
     """
     response = HTTPRedirectHandler.http_error_302(self, req, fp, code, msg,
                                                   headers)
     if response is not None:
         response.status = code
     return response
Example #23
0
 def http_error_302(self, req, fp, code, msg, headers):
     """Pass 302 handling straight through to ``HTTPRedirectHandler``."""
     outcome = HTTPRedirectHandler.http_error_302(
         self, req, fp, code, msg, headers)
     return outcome