Example #1
    def startRequest(self, request, url, feed_config = None, selector_defer=None, sanitize=False):
        downloader = self.downloadercls(self.feed, self.debug, self.snapshot_dir, self.stat_tool, self.memon,
                                        request=request, url=url, feed_config=feed_config,
                                        selector_defer=selector_defer, sanitize=sanitize, max_size=self.max_size)

        sresponse = self.tryLocalPage(url)
        if sresponse:
            if selector_defer:
                reactor.callLater(0, selector_defer.callback, sresponse)
            else:
                downloader.writeResponse(request, sresponse, feed_config)
        else:
            agent = BrowserLikeRedirectAgent(
                Agent(reactor,
                      contextFactory=ScrapyClientContextFactory(),  # skip certificate verification
                      connectTimeout=10),
                redirectLimit=5
            )

            d = agent.request(
                'GET',
                url,
                twisted_headers({
                    'Accept': ['text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'],
                    'Accept-Encoding': ['gzip, deflate, sdch'],
                    'User-Agent': [self.user_agent]
                }),
                None
            )
            print('Request <GET %s> started' % (url,))
            d.addCallback(downloader.downloadStarted)
            d.addErrback(downloader.downloadError)
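
Taken out of its class, the fetch pattern above can be exercised standalone (twisted_headers is presumably an alias for twisted.web.http_headers.Headers). A minimal sketch, assuming Scrapy and Twisted are installed; the URL, header values, and callback names are illustrative only:

from scrapy.core.downloader.contextfactory import ScrapyClientContextFactory
from twisted.internet import reactor
from twisted.web.client import Agent, BrowserLikeRedirectAgent, readBody
from twisted.web.http_headers import Headers

agent = BrowserLikeRedirectAgent(
    Agent(reactor,
          contextFactory=ScrapyClientContextFactory(),  # skip certificate verification
          connectTimeout=10),
    redirectLimit=5
)

def on_response(response):
    # readBody drains the response into a single Deferred result
    d = readBody(response)
    d.addCallback(lambda body: print('got %d bytes' % len(body)))
    return d

d = agent.request(
    b'GET',
    b'https://example.com/',   # placeholder URL
    Headers({b'User-Agent': [b'downloader-sketch']}),
    None)
d.addCallback(on_response)
d.addErrback(lambda failure: print('failed: %s' % failure.value))
d.addBoth(lambda _: reactor.stop())
reactor.run()

BrowserLikeRedirectAgent follows up to redirectLimit redirects the way a browser would, so the callbacks only ever see the final response, never the intermediate 3xx replies.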
Example #2
    def getContext(self, hostname=None, port=None):
        # SSLv23_METHOD negotiates the highest TLS version both sides support
        self.method = SSL.SSLv23_METHOD
        ctx = ScrapyClientContextFactory.getContext(self)
        ctx.set_options(SSL.OP_ALL)  # enable workarounds for known SSL bugs
        if hostname:
            # ClientTLSOptions attaches SNI information for the target host
            ClientTLSOptions(hostname, ctx)
        return ctx
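
This getContext override only makes sense on a subclass. A minimal skeleton, assuming the older OpenSSL-based ScrapyClientContextFactory that still exposes getContext; the class name CustomClientContextFactory is hypothetical, and note that ClientTLSOptions lives in a private Twisted module:

from OpenSSL import SSL
from scrapy.core.downloader.contextfactory import ScrapyClientContextFactory
from twisted.internet._sslverify import ClientTLSOptions  # private Twisted module

class CustomClientContextFactory(ScrapyClientContextFactory):
    # hypothetical subclass name; hosts the override shown above
    def getContext(self, hostname=None, port=None):
        self.method = SSL.SSLv23_METHOD
        ctx = ScrapyClientContextFactory.getContext(self)
        ctx.set_options(SSL.OP_ALL)
        if hostname:
            ClientTLSOptions(hostname, ctx)
        return ctx

In a Scrapy project a factory like this is normally switched on through the DOWNLOADER_CLIENTCONTEXTFACTORY setting rather than instantiated by hand.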
Example #3
    def testPayloadDefaultCiphers(self):
        s = "0123456789" * 10
        d = getPage(
            self.getURL("payload"),
            body=s,
            contextFactory=ScrapyClientContextFactory())
        return self.assertFailure(d, OpenSSL.SSL.Error)
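
getPage is Twisted's long-deprecated one-shot fetch helper. The test name suggests the companion test server only accepts ciphers outside the factory's default set, so the handshake is expected to break; assertFailure succeeds only if the Deferred errbacks with an OpenSSL.SSL.Error.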
Example #4
    def getContext(self, hostname=None, port=None):
        ctx = ScrapyClientContextFactory.getContext(self)
        # Enable all workarounds to SSL bugs as documented by
        # http://www.openssl.org/docs/ssl/SSL_CTX_set_options.html
        ctx.set_options(SSL.OP_ALL)
        if hostname:
            ClientTLSOptions(hostname, ctx)
        return ctx
Example #5
        r.set(url, int(time.time()))
    return 0

GC_PERIOD_SECONDS = 3 * 60 * 60 # 3 hours

def periodical_garbage_collect():
    tm = int(time.time())
    if tm - periodical_garbage_collect.time >= GC_PERIOD_SECONDS:
        print('GC: the number of unreachable objects: %s' % gc.collect())
        periodical_garbage_collect.time = tm

periodical_garbage_collect.time = int(time.time())

agent = BrowserLikeRedirectAgent(
    Agent(reactor,
          contextFactory=ScrapyClientContextFactory(),  # skip certificate verification
          connectTimeout=10),
    redirectLimit=5
)

def html2json(el):
    return [
        el.tag,
        {"tag-id": el.attrib["tag-id"]},
        [html2json(e) for e in el.getchildren() if isinstance(e, etree.ElementBase)]
    ]

def setBaseAndRemoveScriptsAndMore(response, url):
    response.selector.remove_namespaces()

    tree = response.selector.root.getroottree()
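
html2json above assumes every element carries a tag-id attribute and that elements derive from etree.ElementBase, which only happens with a custom element class lookup. A usage sketch under those assumptions; the input document is made up:

from lxml import etree

parser = etree.XMLParser()
# make parsed elements instances of ElementBase so the isinstance check matches
parser.set_element_class_lookup(
    etree.ElementDefaultClassLookup(element=etree.ElementBase))

root = etree.fromstring('<div tag-id="1"><p tag-id="2">hi</p></div>', parser)
print(html2json(root))
# -> ['div', {'tag-id': '1'}, [['p', {'tag-id': '2'}, []]]]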
Example #6
    def __init__(self):
        ScrapyClientContextFactory.__init__(self)
        # pin the handshake to TLS 1.2 only
        self.method = SSL.TLSv1_2_METHOD
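
An __init__ like this pins every connection to TLS 1.2. A sketch of the full subclass and how it could be handed to an Agent; the name Tls12ClientContextFactory is an assumption:

from OpenSSL import SSL
from scrapy.core.downloader.contextfactory import ScrapyClientContextFactory
from twisted.internet import reactor
from twisted.web.client import Agent

class Tls12ClientContextFactory(ScrapyClientContextFactory):
    # hypothetical name; refuses anything older than TLS 1.2
    def __init__(self):
        ScrapyClientContextFactory.__init__(self)
        self.method = SSL.TLSv1_2_METHOD

agent = Agent(reactor,
              contextFactory=Tls12ClientContextFactory(),
              connectTimeout=10)

Servers that cannot negotiate TLS 1.2 will then fail the handshake with an OpenSSL error instead of silently downgrading.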