Ejemplo n.º 1
0
    def gtaskManager(self, urls, extractSearchResults, proxy_flag=0, ua_flag=0):
        """Set up the shared proxy / user-agent globals and run the URL tasks.

        Args:
            urls: Forwarded unchanged to ``AccessUrls.taskGenerator``.
            extractSearchResults: Forwarded unchanged to
                ``AccessUrls.taskGenerator``.
            proxy_flag: When equal to ``1`` a ``ProxyManager`` is created;
                any other value leaves ``proxymgr`` set to ``None``.
            ua_flag: When equal to ``1`` the user-agent list is fetched with
                ``get_useragents``; any other value uses ``[None]``.

        Side effects:
            Rebinds the module-level names ``proxymgr`` and ``useragents``
            and prints ``proxymgr`` to stdout.
        """
        global proxymgr, useragents

        task_log = None
        gtaskpool.setlogging(logging.INFO, task_log)

        proxy_list_urls = ["http://192.168.120.17:8014/proxy/get_http_proxy_list"]
        useragent_url = "http://192.168.120.17:8014/proxy/get_useragent_list"
        # Raw string: regex patterns should never rely on Python's own
        # backslash handling, even when (as here) none is present yet.
        limited_urls = [
            (r'^https{0,1}://', 0),
        ]

        if proxy_flag == 1:
            # interval / delay are 1800 and 480 -- presumably seconds; confirm
            # against ProxyManager.
            proxymgr = ProxyManager(
                get_http_proxies, limited_urls,
                {'refresh': True, 'interval': 30 * 60, 'delay': 8 * 60},
                *proxy_list_urls)
        else:
            proxymgr = None
        # Parenthesised single-argument form prints the same text on
        # Python 2 and Python 3.
        print(proxymgr)

        fetched_agents = get_useragents(useragent_url) if ua_flag == 1 else None
        # Fall back to the [None] placeholder for *any* empty result (empty
        # list, None, ...), not only for a literal [].
        useragents = fetched_agents or [None]

        gtaskpool.runtasks(AccessUrls.taskGenerator(self, urls, extractSearchResults))
Ejemplo n.º 2
0
    def gtaskmanager(self, engine_type):
        """Set up the shared proxy / user-agent globals and run the tasks.

        Args:
            engine_type: Forwarded unchanged to ``self.task_generator``.

        Side effects:
            Rebinds the module-level names ``proxymgr`` and ``useragents``.
        """
        global proxymgr, useragents

        # None is passed as the second setlogging argument; 'task_log.log'
        # was used here previously (presumably a log file path -- confirm
        # against gtaskpool.setlogging).
        task_log = None
        gtaskpool.setlogging(logging.INFO, task_log)

        proxy_list_urls = ["http://192.168.120.185:5500/get_google_http_proxy_list"]
        useragent_url = "http://192.168.120.17:8014/proxy/get_useragent_list"
        # Raw string so the regex escapes (\.) are not treated as (invalid)
        # Python string escapes; the resulting pattern text is unchanged.
        limited_urls = [
            (r'^https://search\.disconnect\.me', 1),
        ]

        # interval / delay are 1800 and 480 -- presumably seconds; confirm
        # against ProxyManager.
        proxymgr = ProxyManager(
            get_http_proxies, limited_urls,
            {'refresh': True, 'interval': 30 * 60, 'delay': 8 * 60},
            *proxy_list_urls)

        # Fall back to the [None] placeholder for *any* empty result (empty
        # list, None, ...), not only for a literal [].
        useragents = get_useragents(useragent_url) or [None]

        # NOTE(review): self is passed explicitly on top of the attribute
        # access; this only works if task_generator is a staticmethod or
        # accepts an extra positional argument -- confirm.
        gtaskpool.runtasks(self.task_generator(self, engine_type))
Ejemplo n.º 3
0
    def gtaskmanager(self, engine_type):
        """Prepare the global proxy manager and user-agent list, then run tasks.

        Args:
            engine_type: Passed through to ``self.task_generator``.

        Side effects:
            Assigns the module-level ``proxymgr`` and ``useragents`` names.
        """
        global proxymgr, useragents

        # The second setlogging argument is None; 'task_log.log' was used
        # previously (presumably a log file path -- confirm against
        # gtaskpool.setlogging).
        task_log = None
        gtaskpool.setlogging(logging.INFO, task_log)

        purl1 = ["http://192.168.120.185:5500/get_google_http_proxy_list"]
        uurl1 = "http://192.168.120.17:8014/proxy/get_useragent_list"
        # Raw string keeps the regex escapes (\.) out of Python's own string
        # escape processing; the pattern value is identical.
        limited_urls = [(r'^https://search\.disconnect\.me', 1)]

        # 30 * 60 and 8 * 60 are presumably durations in seconds -- confirm
        # against ProxyManager.
        refresh_options = {
            'refresh': True,
            'interval': 30 * 60,
            'delay': 8 * 60,
        }
        proxymgr = ProxyManager(get_http_proxies, limited_urls, refresh_options, *purl1)

        useragents = get_useragents(uurl1)
        # Truthiness check: also covers None or other empty results, which
        # the former ``== []`` comparison let through.
        if not useragents:
            useragents = [None]

        # NOTE(review): self is passed explicitly in addition to the
        # attribute access; correct only if task_generator is a staticmethod
        # or takes an extra positional argument -- confirm.
        gtaskpool.runtasks(self.task_generator(self, engine_type))
Ejemplo n.º 4
0
            "task(%s) - response(len=%s): %s\n"
            % (url, len(r.text), rtext[: min(400, len(rtext))].replace("\r", "").replace("\n", ""))
        )
        fresult.flush()
        return True


def task_generator():
    """Yield 300 ``gtaskpool.Task`` objects, one per ``pn`` offset.

    Each task wraps ``task_retry`` with a one-element argument list holding
    the Baidu search URL for ``wd=apple`` and ``pn`` = 0, 10, 20, ..., 2990.
    """
    url_prefix = "http://www.baidu.com/s?wd=apple&pn="
    total_tasks = 300
    # Step over the offsets directly instead of multiplying an index by 10.
    for offset in xrange(0, total_tasks * 10, 10):
        target = url_prefix + str(offset)
        yield gtaskpool.Task(task_retry, [target])


if __name__ == "__main__":
    gtaskpool.setlogging(logging.INFO)

    # Proxy-list and user-agent-list endpoints on two different hosts.
    purl1 = ["http://192.168.120.17:8014/proxy/get_http_proxy_list"]
    purl2 = ["http://192.168.1.14:5500/get_http_proxy_list"]
    uurl1 = "http://192.168.120.17:8014/proxy/get_useragent_list"
    uurl2 = "http://192.168.1.14:5500/get_useragent_list"

    # Create a ProxyManager if you need one.
    # The pattern is a raw string so that the regex escapes (\. \? \d) are
    # not interpreted as (invalid) Python string escapes; its value is the
    # same as before.
    limited_urls = [(r"^http://www\.baidu\.com/s\?wd=apple&pn=\d+$", 1)]
    proxymgr = ProxyManager(
        get_http_proxies, limited_urls, {"refresh": True, "interval": 30 * 60, "delay": 8 * 60}, *purl2
    )
    # Or, if you don't want to refresh proxies periodically:
    # proxymgr = ProxyManager(get_http_proxies, limited_urls,
    #                         {'refresh': False}, *purl2)
Ejemplo n.º 5
0
#!/usr/bin/env python
# encoding: utf-8

import gtaskpool

import requests

import logging


def task(n1, n2):
    """Fetch http://www.baidu.com and print the start of the response body.

    ``n1`` and ``n2`` are only used to label the log records and the printed
    line. An INFO record is logged before the request and after the print.
    """
    logging.info("task(%s, %s): called", n1, n2)
    response = requests.get("http://www.baidu.com")
    body = response.text
    # Slicing already clamps to the string length, so at most the first 100
    # characters are shown.
    preview = body[:100]
    print("task(%s, %s): response (len=%s): %s..." % (n1, n2, len(body), preview))
    logging.info("task(%s, %s): finished", n1, n2)


def task_generator():
    """Yield ten ``gtaskpool.Task`` objects wrapping ``task``.

    The k-th task (k = 1 .. 10) is built with the argument list ``[k, k]``.
    """
    last_index = 10
    index = 1
    while index <= last_index:
        yield gtaskpool.Task(task, [index, index])
        index += 1


if __name__ == "__main__":
    # Pass logging.INFO to gtaskpool.setlogging, then hand the task
    # generator to gtaskpool.runtasks.
    gtaskpool.setlogging(logging.INFO)
    pending_tasks = task_generator()
    gtaskpool.runtasks(pending_tasks)