Example #1
    def run(self):
        sleep(10)
        self.last_mean = .015
        self.q = RedisQueue('test')
        print('start')
        self.conn = sqlite3.connect("data.db")
        while not self.q.empty():
            features = str(self.q.get())[3:-2].replace("'","").split(', ')  # parse the queued repr of a feature list back into plain strings
            self.features = list(features)
            for self.hold_time in ['_10']:
                df = self.df[self.features+['stock_perc_change'+self.hold_time, 'abnormal_perc_change'+self.hold_time]]
                targets = [self.df['stock_perc_change'+self.hold_time], self.df['abnormal_perc_change'+self.hold_time]]
                positive_dfs = []
                negative_dfs = []
                for i in range(8):
                    a_train, a_test, b_train, b_test = train_test_split(df.iloc[:, :-2], df.iloc[:, -2:], test_size=.4)  # .ix was removed from pandas; use positional .iloc

                    self.train(a_train, b_train)
                    test_result, negative_df, positive_df = self.test(a_test, b_test)
                    if test_result:
                        positive_dfs.append(positive_df)
                        negative_dfs.append(negative_df)
                    else:
                        break

                if test_result:
                    self.get_result(pd.concat(positive_dfs), pd.concat(negative_dfs))
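Most of these examples import a RedisQueue helper that the page itself never shows. For orientation, here is a minimal sketch of such a class, modeled on the well-known Redis FIFO-list recipe: the method names (put/get/empty/qsize) match the usage in the examples, but the constructor signatures clearly differ between the quoted projects, so treat the details as assumptions rather than any project's real implementation.

import redis

class RedisQueue(object):
    """Minimal FIFO queue on a Redis list (a sketch, not any project's real class)."""
    def __init__(self, name, namespace='queue', **redis_kwargs):
        self.__db = redis.Redis(**redis_kwargs)  # host/port/db/password pass straight through
        self.key = '%s:%s' % (namespace, name)   # e.g. 'queue:test'

    def qsize(self):
        return self.__db.llen(self.key)

    def empty(self):
        return self.qsize() == 0

    def put(self, item):
        self.__db.rpush(self.key, item)          # enqueue at the tail

    def get(self, block=True, timeout=None):
        if block:
            item = self.__db.blpop(self.key, timeout=timeout)
            if item:
                item = item[1]                   # blpop returns a (key, value) pair
        else:
            item = self.__db.lpop(self.key)
        return item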
Example #2
class Chunker(object):
    def __init__(self, redis_host):
        self.work_queue = RedisQueue(redis_host, "inqueue")

    def run(self):
        chunk_id = 0

        def shuffled(seq):
            # random.shuffle shuffles in place and returns None,
            # so work on a list copy and return it
            seq = list(seq)
            shuffle(seq)
            return seq

        # xrange objects cannot be concatenated; build a plain list instead
        a_range = list(xrange(1, 10)) + list(xrange(10, 256))
        for a in shuffled(a_range):
            for b in shuffled(xrange(1, 255)):
                if a == 172 and b in xrange(16, 32):
                    continue  # skip 172.16.0.0/12 private space
                if a == 192 and b == 168:
                    continue  # skip 192.168.0.0/16 private space
                for c in shuffled(xrange(1, 255)):
                    ip_range = "{0}.{1}.{2}.0/24".format(a, b, c)
                    print "Sending chunk {0} range: {1}".format(chunk_id,
                            ip_range)
                    task = {
                            "range": ip_range,
                            "id": chunk_id
                           }
                    self.work_queue.put(task)
                    chunk_id += 1
                    sleep(10)

    def run_test(self):
        self.work_queue.put({"range": "129.21.50.0/24", "id":0})
        self.work_queue.put({"range": "129.21.49.0/24", "id":1})
Example #3
class TCQ():
    '''A class of tc redis queue'''
    def __init__(self, _obj, _q_type=None):
        self._obj = _obj
        self._q_type = _q_type  # crawler type
        self.tc_type = Config.TC_TYPE  # queue type
        # DB
        self.redisQueue = RedisQueue()  # redis queue

        # message
        self.message = Message()

        # queue key
        if self._q_type:
            self._key = '%s_%s_%s' % (self.tc_type, self._obj, self._q_type)
        else:
            self._key = '%s_%s' % (self.tc_type, self._obj)

    # clear queue
    def clearQ(self):
        self.redisQueue.clear_q(self._key)

    # write into the redis queue
    def putQ(self, _msg):
        self.redisQueue.put_q(self._key, _msg)

    # convert items to msgs and enqueue them
    def putlistQ(self, item_list):
        for _item in item_list:
            _val = (0, self._obj, self._q_type) + _item
            msg = self.message.QueueMsg(self._obj, _val)
            if msg:
                self.putQ(msg)
Example #4
class JHSQ():
    '''A class of jhs redis queue'''
    def __init__(self, _obj, _q_type=None):
        self._obj       = _obj
        self._q_type    = _q_type           # queue type
        self.jhs_type   = Config.JHS_TYPE   # queue type
        # DB
        self.redisQueue  = RedisQueue()      # redis queue

        # message
        self.message     = Message()

        # queue key
        if self._q_type:
            self._key    = '%s_%s_%s' % (self.jhs_type, self._obj, self._q_type)
        else:
            self._key    = '%s_%s' % (self.jhs_type, self._obj)

    # clear queue
    def clearQ(self):
        self.redisQueue.clear_q(self._key)

    # write into the redis queue
    def putQ(self, _msg):
        self.redisQueue.put_q(self._key, _msg)

    # convert items to msgs and enqueue them
    def putlistQ(self, item_list):
        for _item in item_list:
            _val = (0,self._obj,self.jhs_type) + _item
            msg = self.message.jhsQueueMsg(self._obj, _val)
            if msg:
                self.putQ(msg)
Example #5
 def __init__(self, redis_host, es_urls):
     self.pages_queue = RedisQueue(redis_host, "pagesqueue") # take pages out of this queue
     self.links_queue = RedisQueue(redis_host, "linksqueue") # put links into this queue
     self.connection = pyelasticsearch.ElasticSearch(es_urls)
     try:
         self.connection.create_index("webpages")
     except Exception:
         pass  # index most likely exists already
Example #6
    def __init__(self, in_queue_namespace, out_queue_namespace):
        
        self.in_queue_namespace = in_queue_namespace
        self.out_queue_namespace = out_queue_namespace

        self.in_queue = RedisQueue(in_queue_namespace)
        self.out_queue = RedisQueue(out_queue_namespace)

        print "Parser worker loaded"
Example #7
def start_boss_task():
    pos_lst = ['JAVA', 'C', 'Python', 'PHP', 'IOS', 'Android']
    url = ['https://www.zhipin.com/c101010100-p100104/?page={page}&ka=page-{page}'.format(page=str(i + 1)) for i in
           xrange(2)]
    for p in pos_lst:
        url += ['https://www.zhipin.com/c101010100/h_101010100/?query={pos}&page={page}&ka=page-{page}'.format(
            page=str(i + 1), pos=p) for i in xrange(2)]

    rq = RedisQueue()
    rq.push_task('boss_root', url, level=2)
Example #8
    def __init__(self, in_queue_namespace, out_queue_namespace, apikey):

        self.in_queue_namespace = in_queue_namespace
        self.out_queue_namespace = out_queue_namespace
        self.apikey = apikey

        self.in_queue = RedisQueue(in_queue_namespace)
        self.out_queue = RedisQueue(out_queue_namespace)

        print "Fetcher loaded with apikey", self.apikey
Example #9
class RedisMessageProvider(MessageProvider):
    def __init__(self, host, port, queue_name):
        self.queue = RedisQueue(name=queue_name,
                                namespace='queue',
                                host=host,
                                port=port)
        self.queue.wait_for()

    def get_message(self):
        return self.queue.get()
Example #10
    def __init__(self, in_queue_namespace, out_queue_namespace, apikey):

        self.in_queue_namespace = in_queue_namespace
        self.out_queue_namespace = out_queue_namespace
        self.apikey = apikey

        self.in_queue = RedisQueue(in_queue_namespace)
        self.out_queue = RedisQueue(out_queue_namespace)

        print "Fetcher loaded with apikey", self.apikey
Example #11
class QueueManage(object):

    def __init__(self, name):
        self.q_obj = RedisQueue(name)

    def get_queue_data(self):
        q_re = self.q_obj.get_all()
        return q_re

    def queue_size(self):
        return self.q_obj.qsize()
Example #12
 def getQueue(self, ipaddr, port, name, namespace="queues", fromcache=True):
     if not fromcache:
         return RedisQueue(self.get(ipaddr, port, fromcache=False),
                           name,
                           namespace=namespace)
     key = "%s_%s_%s_%s" % (ipaddr, port, name, namespace)
     if key not in self._redisq:
         self._redisq[key] = RedisQueue(self.get(ipaddr, port),
                                        name,
                                        namespace=namespace)
     return self._redisq[key]
Example #13
def main():

    done_que = RedisQueue('seed')
    run_que = RedisQueue('run')
    # worker_download
    seeds = ['http://www.2345.com']
    worker_download = Worker(seeds, done_que, run_que)

    try:
        worker_download.work()
    except KeyboardInterrupt:
        print "Ctrl+C"
        if worker_download.debugnosave == 0:
            worker_download.savestate()
Example #14
def dump_traffic():
    global packets_dump
    global redis_packet_queue
    global redis_results_queue

    redis_packet_queue = RedisQueue('packet_worker_queue')
    redis_results_queue = RedisQueue('packet_results_queue')

    print('[*] Packet dumping thread is now online')
    while True:
        ts = time.time()
        date = dt.datetime.fromtimestamp(ts).strftime('%d-%m-%Y_%H:%M:%S')
        save_as_csv('./packet_dump{}.csv'.format(date), ['Status code', 'Method', 'Version', 'Scheme', 'Request Length', 'Response Length', 'Request Entropy', 'Response Entropy', 'Client Connection', 'Server Connection'], packets_dump)
        time.sleep(THREAD_SLEEP_TIME)
Example #15
class Crawler(object):
    def __init__(self, redis_host, depth=10):
        self.links_queue = RedisQueue(redis_host, "linksqueue")
        self.pages_queue = RedisQueue(redis_host, "pagesqueue")

    def run(self):
        while True:
            link = self.links_queue.get().data
            try:
                page = WebPage(requests.get(link).text, link, 80)
            except Exception:
                print("Exception GETing {0}".format(link))
                continue
            self.pages_queue.put(page.to_dict())
Example #16
    def __init__(self, _obj, _q_type=None):
        self._obj = _obj
        self._q_type = _q_type  # crawler type
        self.tc_type = Config.TC_TYPE  # queue type
        # DB
        self.redisQueue = RedisQueue()  # redis queue

        # message
        self.message = Message()

        # queue key
        if self._q_type:
            self._key = '%s_%s_%s' % (self.tc_type, self._obj, self._q_type)
        else:
            self._key = '%s_%s' % (self.tc_type, self._obj)
Example #17
    def __init__(self):
        # jhs brand type
        self.worker_type    = Config.JHS_Brand
        # DB
        self.jhs_type       = Config.JHS_TYPE   # queue type
        self.mysqlAccess    = MysqlAccess()     # mysql access
        self.redisQueue     = RedisQueue()      # redis queue
        self.redisAccess    = RedisAccess()     # redis db
        self.mongofsAccess  = MongofsAccess()   # mongodb fs access

        # fetch JSON data
        self.jsonpage       = Jsonpage()

        # crawler settings
        self.crawler        = TBCrawler()

        # page template parser
        self.brand_temp     = JHSBrandTEMP()

        # message
        self.message        = Message()

        # crawl time settings
        self.crawling_time  = Common.now() # current crawl time
        self.begin_time     = Common.now()
        self.begin_date     = Common.today_s()
        self.begin_hour     = Common.nowhour_s()
Example #18
    def __init__(self):
        # jhs group item type
        self.worker_type    = Config.JHS_GroupItem

        self.jhs_type       = Config.JHS_TYPE   # queue type

        # message
        self.message        = Message()

        # fetch JSON data
        self.jsonpage       = Jsonpage()

        # crawler settings
        self.crawler        = TBCrawler()

        # crawl time settings
        self.crawling_time  = Common.now() # current crawl time
        self.begin_time     = Common.now()
        self.begin_date     = Common.today_s()
        self.begin_hour     = Common.nowhour_s()

        # DB
        # mysql access
        self.mysqlAccess    = MysqlAccess()

        # redis queue
        self.redisQueue     = RedisQueue()

        # redis access
        self.redisAccess    = RedisAccess()

        # mongodb fs access
        self.mongofsAccess  = MongofsAccess()
Example #19
    def process_request_origin(self, request, spider):
        redis = RedisQueue('proxy_ip')
        if not redis.empty():
            proxy_ip = redis.get()
        else:
            proxy_ip = get_ip()

        proxy_para = {
                'ip_port': proxy_ip,
                'user_pass': ''
            }
        request.meta['proxy'] = "http://%s" % proxy_para['ip_port']
        if proxy_para['user_pass'] is not None:
            encoded_user_pass = base64.encodestring(proxy_para['user_pass'])
            request.headers['Proxy-Authorization'] = 'Basic ' + encoded_user_pass
        print "*********************** RedisProxyMiddleware Using proxy ip: %s *****" % proxy_para['ip_port']
        redis.put(proxy_ip)
Example #20
class Receiver(object):
    def __init__(self, redis_host):
        self.output_queue = RedisQueue(redis_host, "outqueue")

    def run(self):
        while True:
            result = self.output_queue.get().data
            pprint(result)
            print "---"

    def run_dump(self):
        dumpfile = open("netcrawl.log", "w")
        while True:
            result = self.output_queue.get().data
            pprint(result)
            dumpfile.write(pformat(result) + "\n")
            dumpfile.flush()
Example #21
 def getRedisQueue(self,
                   ipaddr,
                   port,
                   name,
                   namespace="queues",
                   fromcache=True):
     if not fromcache:
         return RedisQueue(self.getRedisClient(ipaddr,
                                               port,
                                               fromcache=False),
                           name,
                           namespace=namespace)
     key = "%s_%s_%s_%s" % (ipaddr, port, name, namespace)
      if key not in self.redisq:
         self.redisq[key] = RedisQueue(self.getRedisClient(ipaddr, port),
                                       name,
                                       namespace=namespace)
     return self.redisq[key]
Example #22
 def getGeventRedisQueue(self,
                         ipaddr,
                         port,
                         name,
                         namespace="queues",
                         fromcache=False):
     fromcache = False  # @todo remove
     if not fromcache:
         return RedisQueue(self.getGeventRedisClient(ipaddr, port, False),
                           name,
                           namespace=namespace)
     key = "%s_%s_%s_%s" % (ipaddr, port, name, namespace)
      if key not in self.gredisq:
         self.gredisq[key] = RedisQueue(self.getGeventRedisClient(
             ipaddr, port),
                                        name,
                                        namespace=namespace)
     return self.gredisq[key]
Example #23
class Indexer(object):
    def __init__(self, redis_host, es_urls):
        self.pages_queue = RedisQueue(redis_host, "pagesqueue") # take pages out of this queue
        self.links_queue = RedisQueue(redis_host, "linksqueue") # put links into this queue
        self.connection = pyelasticsearch.ElasticSearch(es_urls)
        try:
            self.connection.create_index("webpages")
        except Exception:
            pass  # index most likely exists already

    def run(self):
        while True:
            result = self.pages_queue.get().data
            result['tags'] = genTags(result['html'])
            self.connection.index('webpages', 'webpage', result, id=result['ip'])
            print('Indexed {0}'.format(result['ip']))
            for link in result['links']:
                self.links_queue.put(link)
Example #24
class StudyscrapyPipeline(object):
    def __init__(self):
        self.q = RedisQueue(name='CSDN', host='localhost', port=6379, db=3)
        if redis_db.hlen(redis_data_dict) == 0:
            pass

    def process_item(self, item, spider):
        # fp = open(r'F:\Spider\Spider\studyscrapy\out.txt', 'a+')
        if redis_db.hexists(redis_data_dict, item['title']):
            print('Item already in the queue <--')
        else:
            # fp.write(item['title']+', '+item['time']+'\n')
            self.q.put(item['title'] + ':' + item['time'])
            redis_db.hset(redis_data_dict, item['title'], item['time'])
            print('title: {0}, time: {1} enqueued successfully'.format(item['title'],
                                                                       item['time']))

        return item
Example #25
def index():
    stats = {
        'currentTemp': 0,
        'currentHumidity': 0,
        'lastUpdateTime': "never",
        'message': "Not initialized",
        'targetTemp': 0,
        'brightness': 0
    }
    form = UpdateForm()
    q = RedisQueue('brooder')
    dataPoints = []
    for item in q.getall():
        dataPoints.append(json.loads(str(item, 'utf-8')))

    if len(dataPoints) > 0:
        stats = dataPoints[-1]

    brooderConfig = json.load(open(brooderConfigFile))
    return render_template('main.html', **locals())
Example #26
    def __init__(self):
        # tc spot type
        self.worker_type   = Config.TC_Spot
        # DB
        self.tc_type       = Config.TC_TYPE    # queue type
        self.mysqlAccess   = MysqlAccess()     # mysql access
        self.redisQueue    = RedisQueue()      # redis queue
        self.mongofsAccess = MongofsAccess()   # mongodb fs access

        # crawler settings
        self.crawler       = TCCrawler()

        # message
        self.message       = Message()

        # crawl time settings
        self.crawling_time = Common.now() # current crawl time
        self.begin_time    = Common.now()
        self.begin_date    = Common.today_s()
        self.begin_hour    = Common.nowhour_s()
Example #27
    def dispose_ip(self, proxy_ip, redis_label):
        redis_list = []
        for i in range(REDIS_NUM):
            redis_list.append(RedisQueue('proxy_ip_%d' %i))
        redis_invalid_ip = RedisQueue('invalid_ip')
        if redis_label == REDIS_NUM - 1:
            redis_invalid_ip.put(proxy_ip)
            redis_list[0].put(get_ip())
        else:
            redis_list[redis_label].remove(proxy_ip)
            redis_list[redis_label+1].put(proxy_ip)
            if redis_list[0].empty():
                redis_list[0].put(get_ip())

        new_redis_label = random.choice(range(REDIS_NUM))
        while redis_list[new_redis_label].empty():
            new_redis_label = random.choice(range(REDIS_NUM))
        new_proxy_ip = redis_list[new_redis_label].get()
        redis_list[new_redis_label].put(new_proxy_ip)
        return new_proxy_ip,new_redis_label
Example #28
    def dispose_ip(self, proxy_ip, redis_label):
        redis_list = []
        for i in range(REDIS_NUM):
            redis_list.append(RedisQueue('proxy_ip_%d' % i))
        redis_invalid_ip = RedisQueue('invalid_ip')
        if redis_label == REDIS_NUM - 1:
            redis_invalid_ip.put(proxy_ip)
            redis_list[0].put(get_ip())
        else:
            redis_list[redis_label].remove(proxy_ip)
            redis_list[redis_label + 1].put(proxy_ip)
            if redis_list[0].empty():
                redis_list[0].put(get_ip())

        new_redis_label = random.choice(range(REDIS_NUM))
        while redis_list[new_redis_label].empty():
            new_redis_label = random.choice(range(REDIS_NUM))
        new_proxy_ip = redis_list[new_redis_label].get()
        redis_list[new_redis_label].put(new_proxy_ip)
        return new_proxy_ip, new_redis_label
Example #29
 def select_ip(REDIS_NUM):
     redis_list = []
     for i in range(REDIS_NUM):
         redis_list.append(RedisQueue('proxy_ip_%d' % i))
     for each in redis_list:
         print each.key
     label = random.choice(range(REDIS_NUM))
     while redis_list[label].empty():
         label = random.choice(range(REDIS_NUM))
     proxy_ip = redis_list[label].get()
     redis_list[label].put(proxy_ip)
     return proxy_ip, label
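Examples #27 to #29 together implement a tiered proxy pool: select_ip draws an IP from a random non-empty queue among proxy_ip_0 .. proxy_ip_{REDIS_NUM-1}, while dispose_ip demotes a failing IP one tier, retires it to invalid_ip at the last tier, and tops tier 0 up via get_ip. A hypothetical caller tying the two together; select_ip, dispose_ip, REDIS_NUM and get_ip are all assumed from the surrounding project:

import requests

def fetch_with_rotation(url, max_tries=5):
    # sketch only: retry a request, demoting the proxy one tier per failure
    proxy_ip, label = select_ip(REDIS_NUM)
    for _ in range(max_tries):
        try:
            return requests.get(url, proxies={'http': 'http://%s' % proxy_ip}, timeout=10)
        except requests.RequestException:
            proxy_ip, label = dispose_ip(proxy_ip, label)
    return None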
Example #30
def main():

    done_que = RedisQueue('seed')
    run_que = RedisQueue('run')

    run_que.flushdb()

    conn = sqlite3.connect('site_data.db')
    conn.execute(
        "create table if not exists mainpages (id integer primary key autoincrement, url TEXT,headers TEXT,content BLOB)"
    )

    spend = 0
    cnt = 0
    size = 0
    while True:

        data = cPickle.loads(done_que.get())
        st = time.time()
        urls = geturls(data['url'], data['content'])
        if len(urls) == 0:
            continue

        for url in urls:
            if url not in bfdone:  # bfdone (defined elsewhere) filters already-crawled urls
                run_que.put(url)

        gziphtml = sqlite3.Binary(gzip.zlib.compress(data['content']))
        size += len(gziphtml)
        conn.execute(
            "insert into mainpages (url,headers,content) values (?,?,?)",
            (data['url'], str(data['headers']), gziphtml))

        et = time.time()
        spend += (et - st)
        cnt += 1
        if cnt % 10 == 0:
            print "cost:", spend / cnt, cnt, done_que.qsize(), size / 1024 / 1024
            conn.commit()
Example #31
class FetcherWorker:

    def __init__(self, in_queue_namespace, out_queue_namespace, apikey):

        self.in_queue_namespace = in_queue_namespace
        self.out_queue_namespace = out_queue_namespace
        self.apikey = apikey

        self.in_queue = RedisQueue(in_queue_namespace)
        self.out_queue = RedisQueue(out_queue_namespace)

        print "Fetcher loaded with apikey", self.apikey


    def run(self):

        while 1:

            base_url = self.in_queue.get()

            if base_url == "None":
                # add end-of-queue markers for parsers
                self.out_queue.put("None") 

                # ends program
                break

            url = base_url + self.apikey 
            
            t1 = time.time()
            
            print "fetching try 1", url

            resp = urllib2.urlopen(url)
            if resp.code == 200: 
                text = resp.read()
                self.out_queue.put(text)
            else:
                print 'failed once', url
                time.sleep(10)
                print "fetching try 2", url
                resp = urllib2.urlopen(url)
                if resp.code == 200:
                    text = resp.read()
                    self.out_queue.put(text)

            print "done fetching"

            # make sure we don't use the same API key within 2 seconds
            t2 = time.time()
            if t2 - t1 < 2.0:
                time.sleep(2.0 - (t2 - t1))
Example #32
    def __init__(self, _obj, _q_type=None):
        self._obj       = _obj
        self._q_type    = _q_type           # queue type
        self.jhs_type   = Config.JHS_TYPE   # queue type
        # DB
        self.redisQueue  = RedisQueue()      # redis queue

        # message
        self.message     = Message()

        # queue key
        if self._q_type:
            self._key    = '%s_%s_%s' % (self.jhs_type, self._obj, self._q_type)
        else:
            self._key    = '%s_%s' % (self.jhs_type, self._obj)
Example #33
    def __init__(self, key, q_type, thread_num=10, a_val=None):
        # parent construct
        MyThread.__init__(self, thread_num)
        # thread lock
        self.mutex          = threading.Lock()

        self.xc_type        = Config.XC_TYPE # xc type
        #self.item_type      = q_type        # item queue type

        # db
        self.mysqlAccess    = MysqlAccess() # mysql access
        self.redisQueue     = RedisQueue()  # redis queue
        self.mongofsAccess  = MongofsAccess() # mongodb fs access

        # xc queue type
        self.xc_queue_type  = q_type # new...
        self._key           = key   # redis queue key

        # appendix val
        self.a_val          = a_val

        # return items
        self.items          = []

        # dial client
        self.dial_client    = DialClient()

        # local ip
        self._ip            = Common.local_ip()

        # router tag
        self._tag           = 'ikuai'
        #self._tag          = 'tpent'

        # give up item, retry too many times
        self.giveup_items   = []
Example #34
def main(run_type, store_num):
    q = RedisQueue(store_num)
    if run_type == 'gen':
        for i in range(len(sectorPolygons1954.sectors)):
            s = generateSectors(sectorPolygons1954.sectors[i], i, q, store_num)
            print('starting thread ' + str(i))
            s.start()

    elif run_type == 'run':
        batch_size = 30
        uri = 'localhost:27017'
        client = pymongo.MongoClient(uri)
        db = client['streets']
        streets = db[store_num]
        streets.create_index([("latitude", pymongo.DESCENDING),
                              ("longitude", pymongo.DESCENDING)])
        for i in range(int(batch_size)):
            rg = request_getter(q, store_num)
            print('starting request thread ' + str(i))
            rg.start()

        while q.qsize():
            sleep(10)
            print(q.qsize())
Example #35
class ParserWorker():

    def __init__(self, in_queue_namespace, out_queue_namespace):
        
        self.in_queue_namespace = in_queue_namespace
        self.out_queue_namespace = out_queue_namespace

        self.in_queue = RedisQueue(in_queue_namespace)
        self.out_queue = RedisQueue(out_queue_namespace)

        print "Parser worker loaded"

    def run(self):

        while 1:
            xml_text = self.in_queue.get()
            print "Received XML"
            if xml_text == "None":
                self.out_queue.put("None")
                break

            json_doc = DataParser.parse_get_state_stats_resp(xml_text)
            print "Made JSON"
            self.out_queue.put(json_doc)
Example #36
 def __init__(self, url='http://weixin.sogou.com/weixin', key_word=None):
     self.url = url
     self.key_word = key_word
     self.headers = {
         'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
         'Accept-Encoding': 'gzip, deflate',
         'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6,ja;q=0.4,zh-TW;q=0.2,mt;q=0.2',
         'Cache-Control': 'max-age=0',
         'Connection': 'keep-alive',
         'Host': 'weixin.sogou.com',
         'Upgrade-Insecure-Requests': '1',
         'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36',
     }
     self.session = Session()
     self.queue = RedisQueue()
     self.mysql = MySQL()
Example #37
 def __init__(self, url='http://weixin.sogou.com/weixin', key_word=None):
     self.url = url
     self.key_word = key_word
     self.headers = {
         'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
         'Accept-Encoding': 'gzip, deflate',
         'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6,ja;q=0.4,zh-TW;q=0.2,mt;q=0.2',
         'Cache-Control': 'max-age=0',
         'Connection': 'keep-alive',
         'Host': 'weixin.sogou.com',
         'Upgrade-Insecure-Requests': '1',
         'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36',
     }
     self.session = Session()
     self.queue = RedisQueue()
     self.mysql = MySQL()
Example #38
class FetcherWorker:
    def __init__(self, in_queue_namespace, out_queue_namespace, apikey):

        self.in_queue_namespace = in_queue_namespace
        self.out_queue_namespace = out_queue_namespace
        self.apikey = apikey

        self.in_queue = RedisQueue(in_queue_namespace)
        self.out_queue = RedisQueue(out_queue_namespace)

        print "Fetcher loaded with apikey", self.apikey

    def run(self):

        while 1:

            base_url = self.in_queue.get()

            if base_url == "None":
                # add end-of-queue markers for parsers
                self.out_queue.put("None")

                # ends program
                break

            url = base_url + self.apikey

            t1 = time.time()

            print "fetching try 1", url

            resp = urllib2.urlopen(url)
            if resp.code == 200:
                text = resp.read()
                self.out_queue.put(text)
            else:
                print 'failed once', url
                time.sleep(10)
                print "fetching try 2", url
                resp = urllib2.urlopen(url)
                if resp.code == 200:
                    text = resp.read()
                    self.out_queue.put(text)

            print "done fetching"

            # make sure we don't use the same API key within 2 seconds
            t2 = time.time()
            if t2 - t1 < 2.0:
                time.sleep(2.0 - (t2 - t1))
Example #39
    def process_exception(self, request, exception, spider):
        request_ip = request.meta['proxy']
        invalid_ip = request_ip.split('//')[1]
        redis = RedisQueue('proxy_ip')
        redis_invalid_ip = RedisQueue('invalid_ip')
        if not redis.empty():
            redis.remove(invalid_ip)
            redis_invalid_ip.put(invalid_ip)
            print '+++++++++++++++++++++++%s' %exception
            print '-----------------------removing ip from redis: %s' %invalid_ip

        new_ip = get_ip()
        proxy_para = {
            'ip_port': new_ip,
            'user_pass': ''
        }
        request.meta['proxy'] = "http://%s" % proxy_para['ip_port']
        if proxy_para['user_pass'] is not None:
            encoded_user_pass = base64.encodestring(proxy_para['user_pass'])
            request.headers['Proxy-Authorization'] = 'Basic ' + encoded_user_pass
        print ">>>>>>>>>>>>>>>>>>>>>>>>>>> switch %s to ip: %s *****" % (invalid_ip,proxy_para['ip_port'])
        redis.put(new_ip)
Example #40
    def process_request(self, request, spider):
        # proxy_ip,redis_label = self.select_ip(REDIS_NUM)
        redis_list = []
        for i in range(REDIS_NUM):
            redis_list.append(RedisQueue('proxy_ip_%d' % i))
        redis_label = random.choice(range(REDIS_NUM))
        while redis_list[redis_label].empty():
            redis_label = random.choice(range(REDIS_NUM))
        proxy_ip = redis_list[redis_label].get()
        redis_list[redis_label].put(proxy_ip)

        proxy_para = {'ip_port': proxy_ip, 'user_pass': ''}
        request.meta['proxy'] = "http://%s" % proxy_para['ip_port']
        request.meta['redis_label'] = redis_label
        if proxy_para['user_pass'] is not None:
            encoded_user_pass = base64.encodestring(proxy_para['user_pass'])
            request.headers[
                'Proxy-Authorization'] = 'Basic ' + encoded_user_pass
        print "*********************** RedisProxyMiddleware Using proxy ip: %s *****" % proxy_para[
            'ip_port']
Example #41
class DatastoreWriterWorker():
    def __init__(self, in_queue_namespace):

        self.in_queue_namespace = in_queue_namespace

        self.in_queue = RedisQueue(in_queue_namespace)

    def run(self):

        while 1:
            json_doc = self.in_queue.get()

            if json_doc == "None":
                break

            print "DatastoreWriterWorker got", json_doc
            print
            print "Write to KV store, Fluentd, and MySQL"
            print
            print
Example #42
class DatastoreWriterWorker:
    def __init__(self, in_queue_namespace):

        self.in_queue_namespace = in_queue_namespace

        self.in_queue = RedisQueue(in_queue_namespace)

    def run(self):

        while 1:
            json_doc = self.in_queue.get()

            if json_doc == "None":
                break

            print "DatastoreWriterWorker got", json_doc
            print
            print "Write to KV store, Fluentd, and MySQL"
            print
            print
Example #43
    def __init__(self):
        # tc spot type
        self.worker_type = Config.TC_Spot
        # DB
        self.tc_type = Config.TC_TYPE  # queue type
        self.mysqlAccess = MysqlAccess()  # mysql access
        self.redisQueue = RedisQueue()  # redis queue
        self.mongofsAccess = MongofsAccess()  # mongodb fs access

        # crawler settings
        self.crawler = TCCrawler()

        # message
        self.message = Message()

        # crawl time settings
        self.crawling_time = Common.now()  # current crawl time
        self.begin_time = Common.now()
        self.begin_date = Common.today_s()
        self.begin_hour = Common.nowhour_s()
Example #44
    def process_request_origin(self, request, spider):
        redis = RedisQueue('proxy_ip')
        if not redis.empty():
            proxy_ip = redis.get()
        else:
            proxy_ip = get_ip()

        proxy_para = {'ip_port': proxy_ip, 'user_pass': ''}
        request.meta['proxy'] = "http://%s" % proxy_para['ip_port']
        if proxy_para['user_pass'] is not None:
            encoded_user_pass = base64.encodestring(proxy_para['user_pass'])
            request.headers[
                'Proxy-Authorization'] = 'Basic ' + encoded_user_pass
        print "*********************** RedisProxyMiddleware Using proxy ip: %s *****" % proxy_para[
            'ip_port']
        redis.put(proxy_ip)
Example #45
    def __init__(self, key, q_type, thread_num=10, a_val=None):
        # parent construct
        MyThread.__init__(self, thread_num)
        # thread lock
        self.mutex          = threading.Lock()

        self.tc_type        = Config.TC_TYPE # tc type
        #self.item_type      = q_type        # item queue type

        # db
        self.mysqlAccess    = MysqlAccess() # mysql access
        self.redisQueue     = RedisQueue()  # redis queue
        self.mongofsAccess  = MongofsAccess() # mongodb fs access

        # tc queue type
        self.tc_queue_type  = q_type # new...
        self._key           = key   # redis queue key

        # appendix val
        self.a_val          = a_val

        # return items
        self.items          = []

        # dial client
        self.dial_client    = DialClient()

        # local ip
        self._ip            = Common.local_ip()

        # router tag
        self._tag           = 'ikuai'
        #self._tag          = 'tpent'

        # give up item, retry too many times
        self.giveup_items   = []
Example #46
    def __init__(self, itemtype, q_type, thread_num=10, a_val=None):
        # parent construct
        MyThread.__init__(self, thread_num)
        # thread lock
        self.mutex          = threading.Lock()

        self.jhs_type       = Config.JHS_TYPE # jhs type
        self.item_type      = itemtype      # item type

        # db
        self.mysqlAccess    = MysqlAccess() # mysql access
        self.redisQueue     = RedisQueue()  # redis queue
        self.mongofsAccess  = MongofsAccess() # mongodb fs access

        # jhs queue type
        self.jhs_queue_type = q_type     # h: hourly
        self._key           = '%s_%s_%s' % (self.jhs_type,self.item_type,self.jhs_queue_type)

        # appendix val
        self.a_val          = a_val

        # activity items
        self.items          = []

        # dial client
        self.dial_client    = DialClient()

        # local ip
        self._ip            = Common.local_ip()

        # router tag
        self._tag           = 'ikuai'
        #self._tag          = 'tpent'

        # give up item, retry too many times
        self.giveup_items   = []
Example #47
# 
#
# store the website
# dbname with ip?
#   
# sitedata_2nd_
#
#
# 
# db = dbd['runque']
# db = dbd['extracturls']
# 

dbd = dict()
dbd['runque'] = 1
dbd['extracturls'] = 2

host = "127.0.0.1"
password = '******'
# first insert into the done_site.bin

rq = RedisQueue(name='extracturls', host=host, password=password, db=dbd['extracturls'])
rr = RedisQueue(name='runque', host=host, password=password, db=dbd['runque'])


print rq.qsize()
print rr.qsize()
#exit(0)


Example #48
    def __init__(self, in_queue_namespace):

        self.in_queue_namespace = in_queue_namespace

        self.in_queue = RedisQueue(in_queue_namespace)
Example #49
    def process_exception(self, request, exception, spider):
        request_ip = request.meta['proxy']
        invalid_ip = request_ip.split('//')[1]
        redis = RedisQueue('proxy_ip')
        redis_invalid_ip = RedisQueue('invalid_ip')
        if not redis.empty():
            redis.remove(invalid_ip)
            redis_invalid_ip.put(invalid_ip)
            print '+++++++++++++++++++++++%s' % exception
            print '-----------------------removing ip from redis: %s' % invalid_ip

        new_ip = get_ip()
        proxy_para = {'ip_port': new_ip, 'user_pass': ''}
        request.meta['proxy'] = "http://%s" % proxy_para['ip_port']
        if proxy_para['user_pass'] is not None:
            encoded_user_pass = base64.encodestring(proxy_para['user_pass'])
            request.headers[
                'Proxy-Authorization'] = 'Basic ' + encoded_user_pass
        print ">>>>>>>>>>>>>>>>>>>>>>>>>>> switch %s to ip: %s *****" % (
            invalid_ip, proxy_para['ip_port'])
        redis.put(new_ip)
Example #50
 def __init__(self, redis_host):
     self.work_queue = RedisQueue(redis_host, "inqueue")
Example #51
class TCItemRedisM(MyThread):
    '''A class of tc Item redis queue'''
    def __init__(self, key, q_type, thread_num=10, a_val=None):
        # parent construct
        MyThread.__init__(self, thread_num)
        # thread lock
        self.mutex          = threading.Lock()

        self.tc_type        = Config.TC_TYPE # tc type
        #self.item_type      = q_type        # item queue type

        # db
        self.mysqlAccess    = MysqlAccess() # mysql access
        self.redisQueue     = RedisQueue()  # redis queue
        self.mongofsAccess  = MongofsAccess() # mongodb fs access

        # tc queue type
        self.tc_queue_type  = q_type # new...
        self._key           = key   # redis queue key

        # appendix val
        self.a_val          = a_val

        # return items
        self.items          = []

        # dial client
        self.dial_client    = DialClient()

        # local ip
        self._ip            = Common.local_ip()

        # router tag
        self._tag           = 'ikuai'
        #self._tag          = 'tpent'

        # give up item, retry too many times
        self.giveup_items   = []

    # To dial router
    def dialRouter(self, _type, _obj):
        try:
            _module = '%s_%s' %(_type, _obj)
            self.dial_client.send((_module, self._ip, self._tag))
        except Exception as e:
            Common.log('# To dial router exception: %s' % e)

    def push_back(self, L, v):
        if self.mutex.acquire(1):
            L.append(v)
            self.mutex.release()

    def crawlRetry(self, _key, msg):
        if not msg: return
        msg['retry'] += 1
        _retry = msg['retry']
        _obj = msg["obj"]
        max_time = Config.crawl_retry
        if _obj == 'item':
            max_time = Config.item_crawl_retry
        if _retry < max_time:
            self.redisQueue.put_q(_key, msg)
        else:
            Common.log('# retried too many times, dropping msg:')
            Common.log(msg)

    # insert item
    def insertIteminfo(self, iteminfosql_list, f=False):
        if f or len(iteminfosql_list) >= Config.item_max_arg:
            if len(iteminfosql_list) > 0:
                self.mysqlAccess.insertTCItem(iteminfosql_list)
            return True
        return False

    # item sql list
    def crawl(self):
        _iteminfosql_list = []
        i, M = 0, 2
        n = 0
        while True:
            try:
                _data = self.redisQueue.get_q(self._key)

                # queue is empty
                if not _data:
                    # queue empty: flush pending sql and exit
                    # info
                    self.insertIteminfo(_iteminfosql_list, True)
                    _iteminfosql_list = []

                    i += 1
                    if i > M:
                        Common.log('# all get itemQ item num: %d' % n)
                        Common.log('# not get itemQ of key: %s' % self._key)
                        break
                    time.sleep(10)
                    continue
                n += 1
                item = None
                obj = 'item'
                if self.tc_queue_type == 'spot':
                    # item instance
                    item = Item()
                    #_val = _data[1]
                    _val = _data["val"]
                    if self.a_val: _val = _val + self.a_val

                    item.antPage(_val)
                    # collect the result
                    self.push_back(self.items, item.outSql())

                    # store to DB
                    tickets = item.item_tickets
                    if tickets and len(tickets) > 0:
                        self.mysqlAccess.insertTCTicket(tickets)
                    iteminfoSql = item.outSql()
                    _iteminfosql_list.append(iteminfoSql)
                    if self.insertIteminfo(_iteminfosql_list): _iteminfosql_list = []
                else:
                    continue

                # save raw pages
                #if item and obj != '':
                #    _pages = item.outItemPage(obj, self.tc_queue_type)
                #    self.mongofsAccess.insertTCPages(_pages)

                # delay
                time.sleep(1)

            except Common.NoItemException as e:
                Common.log('# Not item exception: %s' % e)

            except Common.NoPageException as e:
                Common.log('# Not page exception: %s' % e)

            except Common.InvalidPageException as e:
                self.crawlRetry(self._key, _data)
                Common.log('# Invalid page exception: %s' % e)

            except Exception as e:
                Common.log('# Unknown exception crawl item: %s' % e)
                Common.traceback_log()

                self.crawlRetry(self._key, _data)
                if str(e).find('Read timed out') == -1:
                    # redial the router
                    try:
                        self.dialRouter(4, 'item')
                    except Exception as e:
                        Common.log('# DialClient Exception err: %s' % e)
                        time.sleep(10)
                    time.sleep(random.uniform(10,30))
Example #52
from RedisQueue import RedisQueue
import sys
import random
from pymongo import MongoClient

if __name__ == '__main__':
    db = MongoClient()
    exists = db.zhihu.zhihu_answers
    exist_owners = []
    for e in exists.find():
        exist_owners.append(e['owner'])
    print(len(exist_owners))
    all_ids = [line.strip().split('\t')[0]
               for line in open('./user_followees.data')]
    candidates = list(set(all_ids) - set(exist_owners))
    queue = RedisQueue('answer_queue')
    queue.clear()
    print('Count: %d' % len(candidates))
    for c in candidates[0:]:
        queue.put(c)
Example #53
#!/usr/bin/python
from RedisQueue import RedisQueue
import subprocess
import json
import base64

q = RedisQueue('messages',
               namespace='ansible',
               host='internal-redis.ovmdvp.0001.use2.cache.amazonaws.com',
               port=6379,
               db=1)

while True:
    res = q.get()
    message = json.loads(res)
    subprocess.Popen([
        "/home/ubuntu/ansible-bot/message_bridge/run_ansible_controller.sh",
        message['response_id'], message['playbook'],
        base64.b64encode(res)
    ])
Example #54
class TCWorker:
    """A class of tc worker"""

    def __init__(self):
        # tc spot type
        self.worker_type = Config.TC_Spot
        # DB
        self.tc_type = Config.TC_TYPE  # queue type
        self.mysqlAccess = MysqlAccess()  # mysql access
        self.redisQueue = RedisQueue()  # redis queue
        self.mongofsAccess = MongofsAccess()  # mongodb fs access

        # crawler settings
        self.crawler = TCCrawler()

        # message
        self.message = Message()

        # crawl time settings
        self.crawling_time = Common.now()  # current crawl time
        self.begin_time = Common.now()
        self.begin_date = Common.today_s()
        self.begin_hour = Common.nowhour_s()

    def init_crawl(self, _obj, _crawl_type):
        self._obj = _obj
        self._crawl_type = _crawl_type

        # dial client
        self.dial_client = DialClient()

        # local ip
        self._ip = Common.local_ip()

        # router tag
        self._router_tag = "ikuai"
        # self._router_tag  = 'tpent'

        # items
        self.items = []

        # giveup items
        self.giveup_items = []

        # giveup msg val
        self.giveup_val = None
        self.init_log(_obj, _crawl_type)

    def init_log(self, _obj, _crawl_type):
        if not Logger.logger:
            loggername = "other"
            filename = "crawler_%s" % (time.strftime("%Y%m%d%H", time.localtime(self.begin_time)))
            if _obj == "channel":
                loggername = "channel"
                filename = "add_%s_%s" % (_crawl_type, time.strftime("%Y%m%d%H", time.localtime(self.begin_time)))
            # elif _obj == 'item':

            Logger.config_logging(loggername, filename)

    # To dial router
    def dialRouter(self, _type, _obj):
        try:
            _module = "%s_%s" % (_type, _obj)
            self.dial_client.send((_module, self._ip, self._router_tag))
        except Exception as e:
            Common.log("# To dial router exception: %s" % e)

    # To crawl retry
    def crawlRetry(self, _key, msg):
        if not msg:
            return
        msg["retry"] += 1
        _retry = msg["retry"]
        _obj = msg["obj"]
        max_time = Config.crawl_retry
        if _obj == "channel":
            max_time = Config.channel_crawl_retry
        elif _obj == "item":
            max_time = Config.item_crawl_retry
        if _retry < max_time:
            self.redisQueue.put_q(_key, msg)
        else:
            # self.push_back(self.giveup_items, msg)
            Common.log("# retry too many time, no get msg:")
            Common.log(msg)

    # To crawl page
    def crawlPage(self, _obj, _crawl_type, _key, msg, _val):
        try:
            if _obj == "channel":
                self.run_channel(msg)
            else:
                Common.log("# crawlPage unknown obj = %s" % _obj)
        except Common.InvalidPageException as e:
            Common.log("# Invalid page exception: %s" % e)
            self.crawlRetry(_key, msg)
        except Common.DenypageException as e:
            Common.log("# Deny page exception: %s" % e)
            self.crawlRetry(_key, msg)
            # redial the router
            try:
                self.dialRouter(4, "chn")
            except Exception as e:
                Common.log("# DialClient Exception err: %s" % e)
                time.sleep(random.uniform(10, 30))
            time.sleep(random.uniform(10, 30))
        except Common.SystemBusyException as e:
            Common.log("# System busy exception: %s" % e)
            self.crawlRetry(_key, msg)
            time.sleep(random.uniform(10, 30))
        except Common.RetryException as e:
            Common.log("# Retry exception: %s" % e)
            if self.giveup_val:
                msg["val"] = self.giveup_val
            self.crawlRetry(_key, msg)
            time.sleep(random.uniform(20, 30))
        except Exception as e:
            Common.log("# exception err: %s" % e)
            self.crawlRetry(_key, msg)
            Common.traceback_log()
            if str(e).find("Read timed out") == -1:
                # 重新拨号
                try:
                    self.dialRouter(4, "chn")
                except Exception as e:
                    Common.log("# DailClient Exception err: %s" % e)
                time.sleep(random.uniform(10, 30))

    def run_channel(self, msg):
        msg_val = msg["val"]
        c = Channel()
        c.antPage(msg_val)
        # self.items = c.channel_items
        self.run_items(c)

    # fetch the channel's items in parallel
    def run_items(self, chan):
        Common.log("# Items start, channel_id:%s, channel_name:%s" % (str(chan.channel_id), chan.channel_name))
        # multithreaded: cap the number of concurrent threads
        Common.log("# Items num: %d" % len(chan.channel_items))
        if len(chan.channel_items) > Config.item_max_th:
            m_itemsObj = TCItemM(self._crawl_type, Config.item_max_th)
        else:
            m_itemsObj = TCItemM(self._crawl_type, len(chan.channel_items))
        m_itemsObj.createthread()
        m_itemsObj.putItems(chan.channel_items)
        m_itemsObj.run()

        item_list = m_itemsObj.items
        Common.log("# find Items num: %d" % len(chan.channel_items))
        Common.log("# crawl Items num: %d" % len(item_list))
        giveup_items = m_itemsObj.giveup_items
        if len(giveup_items) > 0:
            Common.log("# giveup Items num: %d" % len(giveup_items))
            raise Common.RetryException("# run_items: some items retry more than max times..")
        Common.log("# Items end, channel_id:%s, channel_name:%s" % (str(chan.channel_id), chan.channel_name))

    def process(self, _obj, _crawl_type, _val=None):
        # self.processMulti(_obj, _crawl_type, _val)
        self.processOne(_obj, _crawl_type, _val)

    def processOne(self, _obj, _crawl_type, _val=None):
        self.init_crawl(_obj, _crawl_type)

        i, M = 0, 20
        if _obj == "channel":
            M = 2
        n = 0
        while True:
            if _crawl_type and _crawl_type != "":
                _key = "%s_%s_%s" % (self.tc_type, _obj, _crawl_type)
            else:
                _key = "%s_%s" % (self.tc_type, _obj)
            _msg = self.redisQueue.get_q(_key)

            # queue is empty
            if not _msg:
                i += 1
                if i > M:
                    Common.log("# not get queue of key: %s" % _key)
                    Common.log("# all get num of item in queue: %d" % n)
                    break
                time.sleep(10)
                continue
            n += 1
            try:
                self.crawlPage(_obj, _crawl_type, _key, _msg, _val)
            except Exception as e:
                Common.log("# exception err in process of TCWorker: %s , key: %s" % (e, _key))
                Common.log(_msg)

    def processMulti(self, _obj, _crawl_type, _val=None):
        self.init_crawl(_obj, _crawl_type)
        if _crawl_type and _crawl_type != "":
            _key = "%s_%s_%s" % (self.tc_type, _obj, _crawl_type)
        else:
            _key = "%s_%s" % (self.tc_type, _obj)

        try:
            self.crawlPageMulti(_obj, _crawl_type, _key, _val)
        except Exception as e:
            Common.log("# exception err in processMulti of TCWorker: %s, key: %s" % (e, _key))

    # To crawl page
    def crawlPageMulti(self, _obj, _crawl_type, _key, _val):
        self.run_multiitems(_key, _val)
        # Common.log('# crawlPageMulti unknown obj = %s' % _obj)

    def run_multiitems(self, _key, _val):
        mitem = TCItemRedisM(_key, self._crawl_type, 20, _val)
        mitem.createthread()
        mitem.run()
        item_list = mitem.items
        Common.log("# crawl Items num: %d" % len(item_list))
Example #55
class JHSGroupItemWorker():
    '''A class of JHS group item channel worker'''
    def __init__(self):
        # jhs group item type
        self.worker_type    = Config.JHS_GroupItem

        self.jhs_type       = Config.JHS_TYPE   # queue type

        # message
        self.message        = Message()

        # fetch JSON data
        self.jsonpage       = Jsonpage()

        # crawler settings
        self.crawler        = TBCrawler()

        # crawl time settings
        self.crawling_time  = Common.now() # current crawl time
        self.begin_time     = Common.now()
        self.begin_date     = Common.today_s()
        self.begin_hour     = Common.nowhour_s()

        # DB
        # mysql access
        self.mysqlAccess    = MysqlAccess()

        # redis queue
        self.redisQueue     = RedisQueue()

        # redis access
        self.redisAccess    = RedisAccess()

        # mongodb fs access
        self.mongofsAccess  = MongofsAccess()

    def init_crawl(self, _obj, _crawl_type):
        self._obj          = _obj
        self._crawl_type   = _crawl_type

        # dial client
        self.dial_client   = DialClient()

        # local ip
        self._ip           = Common.local_ip()

        # router tag
        self._router_tag   = 'ikuai'
        #self._router_tag  = 'tpent'

        # items
        self.items         = []

        # giveup items
        self.giveup_items  = []

        # giveup msg val
        self.giveup_val    = None

    # To dial router
    def dialRouter(self, _type, _obj):
        try:
            _module = '%s_%s' %(_type, _obj)
            self.dial_client.send((_module, self._ip, self._router_tag))
        except Exception as e:
            print '# To dial router exception:', e

    def push_back_list(self, L, v):
        L.extend(v)

    def push_back_val(self, L, v):
        L.append(v)

    # To crawl retry
    def crawlRetry(self, _key, msg):
        if not msg: return
        msg['retry'] += 1
        _retry = msg['retry']
        _obj = msg["obj"]
        max_time = Config.crawl_retry
        if _obj == 'groupitemcat':
            max_time = Config.json_crawl_retry
        elif _obj == 'groupitem':
            max_time = Config.item_crawl_retry
        if _retry < max_time:
            self.redisQueue.put_q(_key, msg)
        else:
            #self.push_back(self.giveup_items, msg)
            print "# retry too many time, no get:", msg

    def crawlPage(self, _key, msg, _val):
        try:
            if self._obj == 'groupitemcat':
                self.run_category(msg, _val)
            else:
                print '# crawlPage unknown obj = %s' % self._obj
        except Common.InvalidPageException as e:
            print '# Invalid page exception:',e
            self.crawlRetry(_key,msg)
        except Common.DenypageException as e:
            print '# Deny page exception:',e
            self.crawlRetry(_key,msg)
            # redial the router
            try:
                self.dialRouter(4, 'chn')
            except Exception as e:
                print '# DialClient Exception err:', e
                time.sleep(random.uniform(10,30))
            time.sleep(random.uniform(10,30))
        except Common.SystemBusyException as e:
            print '# System busy exception:',e
            self.crawlRetry(_key,msg)
            time.sleep(random.uniform(10,30))
        except Common.RetryException as e:
            print '# Retry exception:',e
            if self.giveup_val:
                msg['val'] = self.giveup_val
            self.crawlRetry(_key,msg)
            time.sleep(random.uniform(20,30))
        except Exception as e:
            print '# exception err:',e
            self.crawlRetry(_key,msg)
            # redial the router
            if str(e).find('Read timed out') == -1:
                try:
                    self.dialRouter(4, 'chn')
                except Exception as e:
                    print '# DialClient Exception err:', e
            time.sleep(random.uniform(10,30))
            Common.traceback_log()

    def run_category(self, msg, _val):
        category_val = msg["val"]
        refers = _val
        c_url,c_name,c_id = category_val
        print c_url,c_name,c_id
        page = self.crawler.getData(c_url, refers)
        page_val = (page,c_name,c_id)
        ajax_url_list = self.getAjaxurlList(page_val,c_url)
        if len(ajax_url_list) > 0:
            self.get_jsonitems(ajax_url_list)

    # get json ajax url
    def getAjaxurlList(self, page_val, refers=''):
        url_list = []
        page, c_name, c_id = page_val
        p = re.compile(r'<.+?data-ajaxurl="(.+?)".+?>(.+?)</div>',flags=re.S)
        i = 0
        for a_info in p.finditer(page):
            c_subNav = c_name
            a_url = a_info.group(1).replace('amp;','')
            info = a_info.group(2)
            m = re.search(r'<span class="l-f-tbox">(.+?)</span>',info,flags=re.S)
            if m:
                c_subNav = m.group(1).strip()
            a_val = (c_id,c_name,refers,c_subNav)
            url_list.append((a_url,refers,a_val))
            i += 1
        return url_list

    # get item json list in category page from ajax url
    def get_jsonitems(self, ajax_url_list):
        # today all items val
        todayall_item_val = []
        # other sub nav items val
        item_list = []
        # process ajax url list
        item_json_index = 0
        # mongo json pages
        cat_pages = {}
        for a_url in ajax_url_list:
            # get json from ajax url
            Result_list = self.jsonpage.get_json([a_url])
            # mongo page json
            _url,_refers,_val = a_url 
            _c_id = _val[0]
            time_s = time.strftime("%Y%m%d%H", time.localtime(self.crawling_time))
            # timeStr_jhstype_webtype_itemgroupcat_catid
            key = '%s_%s_%s_%s_%s' % (time_s,Config.JHS_TYPE,'1','itemgroupcat',str(_c_id))
            cat_pages[key] = '<!-- url=%s --> %s' % (_url,str(Result_list))

            if Result_list and len(Result_list) > 0:
                item_result_valList = self.jsonpage.parser_itemjson(Result_list)
                if item_result_valList and len(item_result_valList) > 0:
                    item_json_index += 1
                    # the first item list is all online items
                    if item_json_index == 1:
                        if len(item_result_valList) > 0:
                            print '# all online items.....'
                            todayall_item_val = item_result_valList
                    else:
                        self.push_back_list(item_list, item_result_valList)
                else:
                    print '# not get itemjson parse val list...'
        if len(item_list) > 0:
            self.parseItems(item_list)

        # cat pages json 
        for key in cat_pages.keys():
            _pages = (key,cat_pages[key])
            self.mongofsAccess.insertJHSPages(_pages)
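
    # Mongo page keys follow timeStr_jhstype_webtype_itemgroupcat_catid, so a
    # crawl during the 14:00 hour of 2015-06-01 for category 42 would yield
    # '2015060114_<JHS_TYPE>_1_itemgroupcat_42' (the JHS_TYPE value lives in
    # Config and is not shown in this listing).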

    # parse item data fetched from the API
    def parseItems(self, item_list):
        print '# parse Group Items start:',time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))

        # extra info
        a_val = (self.begin_time,)
        # multithreading: cap the number of concurrent threads
        max_th = Config.item_max_th
        if len(item_list) > max_th:
            m_itemsObj = JHSGroupItemParserM(self._crawl_type, max_th, a_val)
        else:
            m_itemsObj = JHSGroupItemParserM(self._crawl_type, len(item_list), a_val)
        m_itemsObj.createthread()
        m_itemsObj.putItems(item_list)
        m_itemsObj.run()

        _items = m_itemsObj.items
        self.push_back_list(self.items,_items)
        print '# queue item num:',len(self.items)
        print '# parse item num:',len(_items)
        print '# parse Group Items end:',time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
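
    # The if/else above merely caps the worker-thread count at
    # Config.item_max_th; the same choice can be written as
    #   JHSGroupItemParserM(self._crawl_type, min(len(item_list), max_th), a_val)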

    def process(self, _obj, _crawl_type, _val=None):
        self.init_crawl(_obj, _crawl_type)
        if _obj == 'groupitem':
            self.processMulti(_val)
        else:
            self.processOne(_val)

    def processOne(self, _val=None):
        i, M = 0, 10
        n = 0
        while True: 
            try:
                if self._crawl_type and self._crawl_type != '':
                    _key = '%s_%s_%s' % (self.jhs_type, self._obj, self._crawl_type)
                else:
                    _key = '%s_%s' % (self.jhs_type, self._obj)
                _msg = self.redisQueue.get_q(_key)

                # queue is empty
                if not _msg:
                    i += 1
                    if i > M:
                        print '# all get catQ item num:',n
                        print '# not get catQ of key:',_key
                        break
                    time.sleep(10)
                    continue
                n += 1
                self.crawlPage(_key, _msg, _val)

            except Exception as e:
                print '# exception err in process of JHSGroupItemWorker:',e,_key,_msg

    def processMulti(self, _val=None):
        if self._crawl_type and self._crawl_type != '':
            _key = '%s_%s_%s' % (self.jhs_type, self._obj, self._crawl_type)
        else:
            _key = '%s_%s' % (self.jhs_type, self._obj)

        try:
            self.crawlPageMulti(_key, _val)
        except Exception as e:
            print '# exception err in processMulti of JHSGroupItemWorker: %s, key: %s' % (e,_key)

    # To crawl page
    def crawlPageMulti(self, _key, _val):
        if self._obj == 'groupitem':
            self.run_groupitem(_key, _val)
        else:
            print '# crawlPageMulti unknown obj = %s' % self._obj

    def run_groupitem(self, _key, _val):
        m_itemQ = JHSGroupItemQM(self._obj, self._crawl_type, 20, _val)
        m_itemQ.createthread()
        m_itemQ.run()
        item_list = m_itemQ.items
        print '# crawl Items num: %d' % len(item_list)

    # delete expired items from the redis DB
    def delItem(self, _items):
        for _item in _items:
            keys = [self.worker_type, _item["item_juId"]]
            
            item = self.redisAccess.read_jhsitem(keys)
            if item:
                end_time = item["end_time"]
                now_time = Common.time_s(self.begin_time)
                # delete the item if it has expired
                if now_time > end_time: self.redisAccess.delete_jhsitem(keys)

    # store item info in the redis DB
    def putItemDB(self, _items):
        for _item in _items:
            # skip item IDs that already exist
            keys = [self.worker_type, _item["item_juId"]]
            if self.redisAccess.exist_jhsitem(keys): continue

            # write the item's base data to redis
            item_val = self.message.itemInfo(_item["r_val"])
            val = self.message.itemMsg(item_val)
            self.redisAccess.write_jhsitem(keys, val)

    # update item info
    def updateItem(self, _item):
        keys = [self.worker_type, _item["item_juId"]]

        item = self.redisAccess.read_jhsitem(keys)
        if item:
            item_val = self.message.itemParseInfo(_item["r_val"])
            c = False
            if item["start_time"] != item_val["start_time"]:
                item["start_time"] = item_val["start_time"]
                c = True
            if item["end_time"] != item_val["end_time"]:
                item["end_time"] = item_val["end_time"]
                c = True
            if c:
                self.redisAccess.write_jhsitem(keys, item)

    # find new items
    def selectNewItems(self, _items):
        new_items = []
        for _item in _items:
            keys = [self.worker_type, _item["item_juId"]]
            if self.redisAccess.exist_jhsitem(keys): 
                self.updateItem(_item)
                continue
            new_items.append(_item["val"])
        return new_items
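
    # Note: selectNewItems() doubles as an updater -- anything already cached
    # in redis has its start/end time refreshed via updateItem(), and only
    # genuinely new items are returned for crawling.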

    def scanEndItems(self):
        val = (Common.time_s(self.crawling_time),)
        _items = self.mysqlAccess.selectJhsGroupItemEnd(val)
        end_items = []
        # iterate over the items
        for _item in _items:
            item_juid = _item[0]
            end_items.append({"item_juId":str(item_juid)})
        print '# del item nums:',len(end_items)
        # delete items that have already ended
        self.delItem(end_items)

    def scanEndItemsLasthour(self):
        val = (Common.add_hours(self.crawling_time, -2),Common.add_hours(self.crawling_time, -2),Common.add_hours(self.crawling_time, -1))
        _items = self.mysqlAccess.selectJhsGroupItemEndLastOneHour(val)
        end_items = []
        # iterate over the items
        for _item in _items:
            item_juid = _item[0]
            end_items.append({"item_juId":str(item_juid)})
        print '# del item nums for last hour end:',len(end_items)
        # delete items that have already ended
        self.delItem(end_items)
            
    def scanAliveItems(self):
        # up to one hour past the end time
        val = (Common.time_s(self.crawling_time), Common.add_hours(self.crawling_time, -1))
        # find items that have started but not yet ended
        _items = self.mysqlAccess.selectJhsGroupItemAlive(val)
        print "# hour all item nums:",len(_items)
        return _items

    def scanNotEndItems(self):
        val = (Common.time_s(self.crawling_time),)
        # find items that have not ended
        _items = self.mysqlAccess.selectJhsGroupItemNotEnd(val)
        i = 1
        for _item in _items:
            print i
            item_juid = str(_item[1])
            keys = [self.worker_type, item_juid]

            item = self.redisAccess.read_jhsitem(keys)
            print item
            #_new_item = {"crawling_time":item["crawling_time"],"item_juid":item["item_juId"],"groupcat_id":item["item_groupCatId"],"groupcat_name":item["item_groupCatName"],"item_ju_url":item["item_ju_url"],"item_juname":item["item_juName"],"item_id":item["item_id"],"start_time":item["start_time"],"end_time":item["end_time"]}
            #self.redisAccess.write_jhsitem(keys, _new_item)
            i += 1

    def scanCategories(self):
        category_list = self.mysqlAccess.selectJhsGroupItemCategory()
        return category_list
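
# Note: processOne() above polls a Redis-backed queue and gives up after a
# fixed number of empty reads. A minimal self-contained sketch of that
# pattern with redis-py; the queue key, host and 10-second back-off are
# placeholder assumptions, not values taken from this project.
import time
import redis

r = redis.Redis(host='localhost', port=6379)    # assumed local redis
empty_reads, MAX_EMPTY, processed = 0, 10, 0
while True:
    msg = r.lpop('jhs_groupitem_main')          # hypothetical queue key
    if msg is None:
        empty_reads += 1                        # cumulative, as in processOne()
        if empty_reads > MAX_EMPTY:
            print('# queue drained, processed %d messages' % processed)
            break
        time.sleep(10)
        continue
    processed += 1
    # ... handle msg here, re-queueing it on failure as crawlRetry() does ...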
Beispiel #56
0
import urllib2
from RedisQueue import RedisQueue
redis = RedisQueue('jandan3')

def user_agent(url):
    req_header = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0'}
    req_timeout = 20
    req = urllib2.Request(url,None,req_header)
    page = urllib2.urlopen(req,None,req_timeout)
    html = page
    return html

while not redis.empty():
    down_url = redis.get()
    data = user_agent(down_url).read()
    with open('D:/Python/picture/' + down_url[-11:], 'wb') as code:
        code.write(data)
    print down_url
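
# Note: the slice down_url[-11:] above assumes every image URL ends in an
# 11-character file name. Deriving the name from the URL path is safer; a
# small sketch (the sample URL is made up):
import posixpath
import urlparse

def filename_from_url(url):
    # take the last path segment instead of a fixed-width slice
    return posixpath.basename(urlparse.urlparse(url).path)

print(filename_from_url('http://example.com/images/a1b2c3d.jpg'))  # a1b2c3d.jpg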
Beispiel #57
0
class JHSWorker():
    '''A class of jhs worker'''
    def __init__(self):
        # jhs brand type
        self.worker_type    = Config.JHS_Brand
        # DB
        self.jhs_type       = Config.JHS_TYPE   # queue type
        self.mysqlAccess    = MysqlAccess()     # mysql access
        self.redisQueue     = RedisQueue()      # redis queue
        self.redisAccess    = RedisAccess()     # redis db
        self.mongofsAccess  = MongofsAccess()   # mongodb fs access

        # JSON fetching
        self.jsonpage       = Jsonpage()

        # crawler setup
        self.crawler        = TBCrawler()

        # page template parsing
        self.brand_temp     = JHSBrandTEMP()

        # message
        self.message        = Message()

        # crawl time settings
        self.crawling_time  = Common.now() # current crawl time
        self.begin_time     = Common.now()
        self.begin_date     = Common.today_s()
        self.begin_hour     = Common.nowhour_s()

    def init_crawl(self, _obj, _crawl_type):
        self._obj           = _obj
        self._crawl_type    = _crawl_type

        # dial client
        self.dial_client    = DialClient()

        # local ip
        self._ip            = Common.local_ip()

        # router tag
        self._router_tag    = 'ikuai'
        #self._router_tag   = 'tpent'

        # items
        self.items          = []

        # giveup items
        self.giveup_items   = []

        # giveup msg val
        self.giveup_val     = None

    # To dial router
    def dialRouter(self, _type, _obj):
        try:
            _module = '%s_%s' %(_type, _obj)
            self.dial_client.send((_module, self._ip, self._router_tag))
        except Exception as e:
            print '# To dial router exception:', e

    # To crawl retry
    def crawlRetry(self, _key, msg):
        if not msg: return
        msg['retry'] += 1
        _retry = msg['retry']
        _obj = msg["obj"]
        max_time = Config.crawl_retry
        if _obj == 'cat':
            max_time = Config.json_crawl_retry
        elif _obj == 'act':
            max_time = Config.act_crawl_retry
        elif _obj == 'item':
            max_time = Config.item_crawl_retry
        if _retry < max_time:
            self.redisQueue.put_q(_key, msg)
        else:
            #self.push_back(self.giveup_items, msg)
            print "# retry too many time, no get:", msg

    # To crawl page
    def crawlPage(self, _obj, _crawl_type, _key, msg, _val):
        try:
            if _obj == 'cat':
                if _crawl_type == 'home' or _crawl_type == 'homeposition':
                    self.run_cat_home(msg, _val)
                else:
                    self.run_cat(msg, _val)
            elif _obj == 'act':
                self.run_act(msg)
            elif _obj == 'item':
                self.run_item(msg, _val)
            else:
                print '# crawlPage unknown obj = %s' % _obj
        except Common.InvalidPageException as e:
            print '# Invalid page exception:',e
            self.crawlRetry(_key,msg)
        except Common.DenypageException as e:
            print '# Deny page exception:',e
            self.crawlRetry(_key,msg)
            # re-dial the router
            try:
                self.dialRouter(4, 'chn')
            except Exception as e:
                print '# DialClient Exception err:', e
                time.sleep(random.uniform(10,30))
            time.sleep(random.uniform(10,30))
        except Common.SystemBusyException as e:
            print '# System busy exception:',e
            self.crawlRetry(_key,msg)
            time.sleep(random.uniform(10,30))
        except Common.RetryException as e:
            print '# Retry exception:',e
            if self.giveup_val:
                msg['val'] = self.giveup_val
            self.crawlRetry(_key,msg)
            time.sleep(random.uniform(20,30))
        except Exception as e:
            print '# exception err:',e
            self.crawlRetry(_key,msg)
            # re-dial the router
            if str(e).find('Read timed out') == -1:
                try:
                    self.dialRouter(4, 'chn')
                except Exception as e:
                    print '# DialClient Exception err:', e
            time.sleep(random.uniform(10,30))
            Common.traceback_log()

    def run_cat_home(self, msg, _val):
        msg_val = msg["val"]
        _url, refers = msg_val
        print '# brand home:',_url
        page = self.crawler.getData(_url, refers)
        # save to mongo
        # timeStr_jhstype_webtype_obj_crawltype
        time_s = time.strftime("%Y%m%d%H", time.localtime(self.crawling_time))
        key = '%s_%s_%s_%s_%s' % (time_s,Config.JHS_TYPE,'1',self._obj,self._crawl_type)
        p_content = '<!-- url=%s --> %s' % (_url,page)
        self.mongofsAccess.insertJHSPages((key,p_content))

        c_url_val_list = self.brand_temp.temp(page)
        for c_url_val in c_url_val_list:
            c_url, c_name, c_id = c_url_val
            self.items.append((Common.fix_url(c_url),c_id,c_name,Config.ju_brand_home,Config.JHS_Brand))

        if self._crawl_type == 'homeposition':
            top_acts = self.brand_temp.activityTopbrandTemp(page)
            print top_acts
            self.save_top_acts(top_acts)

    def save_top_acts(self, top_acts):
        if top_acts:
            for key in top_acts.keys():
                act = top_acts[key]
                c_time, act_id, act_name, act_position, act_url, f_id, f_name, sub_nav = Common.now(), -1, '', -1, '', 0, '', ''
                c_date, c_hour = time.strftime("%Y-%m-%d", time.localtime(c_time)), time.strftime("%H", time.localtime(c_time))
                if 'act_id' in act:
                    act_id = act["act_id"]
                if 'position' in act:
                    act_position = act["position"]
                if 'url' in act:
                    act_url = act["url"]
                if 'datatype' in act:
                    f_name = act["datatype"]
                val = (Common.time_s(c_time),act_id,act_name,act_url,Config.JHS_Brand,sub_nav,act_position,f_id,f_name,'',c_date,c_hour)
                self.mysqlAccess.insertJhsActPosition_hour(val)

    def run_cat(self, msg, _val):
        msg_val = msg["val"]
        c_url, c_id, c_name, refers, pagetype = msg_val
        print '# category',c_name,c_id
        if pagetype == Config.JHS_Brand:
            a_val = (c_id, c_name)
            self.get_actjson(c_url, refers, a_val, _val, pagetype)
        elif pagetype == Config.JHS_GroupItem:
            self.get_cat_jsons(c_url, c_id, c_name, refers, _val, pagetype)
        else:
            print '# not get category pagetype...'

    def get_cat_jsons(self, c_url, c_id, c_name, refers, _val, pagetype):
        a_val = (c_id, c_name)
        page = self.crawler.getData(c_url, refers)
        page_val = (page,c_id,c_name)
        ajax_url_list = self.getAjaxurlList(page_val)
        if len(ajax_url_list) > 0:
            # process ajax url list
            for url_val in ajax_url_list:
                c_url,c_subNav = url_val
                self.get_actjson(c_url, refers, a_val, _val, pagetype, c_subNav)

    def get_actjson(self, c_url, refers, a_val, _val, pagetype, c_subNav=''):
        if self._crawl_type == 'position':
            _val = (pagetype,c_subNav) + _val

        Result_list = self.jsonpage.get_jsonPage(c_url,refers,a_val)
        if Result_list and len(Result_list) > 0:
            # parser act result
            act_valList = self.jsonpage.parser_brandjson(Result_list,_val)
            if act_valList and len(act_valList) > 0:
                print '# get brand act num:',len(act_valList)
                self.items.extend(act_valList)
            else:
                print '# not get brandjson parse val list...'

    # get json ajax url
    def getAjaxurlList(self, page_val):
        url_list = []
        page, c_id, c_name = page_val
        p = re.compile(r'<.+?id="(.+?)".+?data-ajaxurl="(.+?)".+?>(.+?)</div>',flags=re.S)
        i = 0
        for a_info in p.finditer(page):
            c_subNav = ''
            f_id = a_info.group(1)
            a_url = a_info.group(2).replace('amp;','')
            info = a_info.group(3)
            m = re.search(r'<span class="l-f-tbox">(.+?)</span>', info, flags=re.S)
            if m:
                c_subNav = m.group(1).strip()
            if c_subNav == '':
                m = re.search(r'<td.+?data-target="%s".+?>(.+?)</td>' % f_id, page, flags=re.S)
                if m:
                    c_subNav = re.sub(r'<.+?>','',m.group(1))
            #url_list.append((a_url,refers,a_val))
            url_list.append((a_url,c_subNav))
            i += 1
        return url_list
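
    # Unlike the earlier getAjaxurlList(), this variant also captures the
    # block's id attribute (f_id) and, when no <span class="l-f-tbox"> label
    # exists, falls back to the text of the <td> whose data-target references
    # that id.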

    # ACT queue
    def run_act(self, msg):
        # default data
        msg_val = msg["val"]
        print '# act start:',time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
        act_obj = None
        if self._crawl_type == 'main':
            act_obj = JHSAct()
            act_obj.antPageMain(msg_val)
        elif self._crawl_type == 'check':
            act_obj = JHSAct()
            act_obj.antPageCheck(msg_val)
        elif self._crawl_type == 'position':
            act_obj = JHSAct()
            act_obj.antPageParser(msg_val)
        print '# act end:',time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))

        if self._crawl_type == 'position':
            brandact_id,brandact_name,brandact_url,brandact_sign,brandact_status,val = act_obj.outTupleForPosition()
            if int(brandact_sign) != 3:
                if act_obj.brandact_starttime and act_obj.brandact_starttime != 0.0 and 1 >= Common.subTS_hours(int(float(act_obj.brandact_starttime)/1000), self.crawling_time):
                    print '# insert activity position, id:%s name:%s'%(act_obj.brandact_id, act_obj.brandact_name)
                    self.mysqlAccess.insertJhsActPosition_hour(val)
                
                elif brandact_status != '' and brandact_status != 'blank':
                    print '# insert activity position, id:%s name:%s'%(act_obj.brandact_id, act_obj.brandact_name)
                    self.mysqlAccess.insertJhsActPosition_hour(val)
        else:
            act_keys = [self.worker_type, str(act_obj.brandact_id)]
            prev_act = self.redisAccess.read_jhsact(act_keys)
            # do we need to crawl the items?
            if act_obj and act_obj.crawling_confirm != 2:
                # save the activity info
                self.putActDB(act_obj, prev_act)
                # items in the activity
                items_list = []
                # only take non-Ladygo items
                if int(act_obj.brandact_sign) != 3:
                    if act_obj.crawling_confirm == 0:
                        # update item positions in activities about to start
                        self.update_actItems_position(act_obj)
                    # crawl items with multiple threads
                    items_list = self.run_actItems(act_obj, prev_act)
                else:
                    print '# ladygo activity id:%s name:%s'%(act_obj.brandact_id, act_obj.brandact_name)

                #print '# pro act start:',time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
                # process the activity info
                #self.procAct(act_obj, prev_act, items_list)
                # process the activity's redis info
                self.procActRedis(act_obj, prev_act, items_list)
                #print '# pro act end:',time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
            else:
                self.update_startact(act_obj, prev_act)
                print '# Already-started activity, id:%s name:%s'%(act_obj.brandact_id, act_obj.brandact_name)

    # update an activity after it has started
    def update_startact(self, act, prev_act):
        if act.brandact_endtime and act.brandact_endtime != 0.0:
            end_time_s = Common.time_s(float(act.brandact_endtime)/1000)
            if prev_act and end_time_s != prev_act['end_time']:
                prev_act['end_time'] = end_time_s
                # redis
                keys = [self.worker_type, str(act.brandact_id)]
                self.redisAccess.write_jhsact(keys, prev_act)
                self.mysqlAccess.updateJhsActEndtime((end_time_s,str(act.brandact_id)))

    # update item positions in activities about to start
    def update_actItems_position(self, act):
        update_val_list = []
        act_id = act.brandact_id
        for item in act.brandact_itemVal_list:
            if str(item[7]) != '':
                update_val_list.append((str(item[7]),str(act_id),item[4]))
        self.mysqlAccess.updateJhsItemPosition(update_val_list)

    # fetch brand-sale items in parallel
    def run_actItems(self, act, prev_act):
        print '# act items start:',time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
        # items that need crawling
        item_val_list = []
        # filter out the list of item IDs already crawled
        item_ids = act.brandact_itemids
        if prev_act:
            prev_item_ids = prev_act["item_ids"]
            item_ids      = Common.diffSet(item_ids, prev_item_ids)

            # if a previously crawled activity has no newly listed items, bail out
            if len(item_ids) == 0:
                print '# Activity no new Items'
                print '# Activity Items end:',time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())), act.brandact_id, act.brandact_name
                return None

            for item in act.brandact_itemVal_list:
                if str(item[6]) in item_ids or str(item[7]) in item_ids:
                    item_val_list.append(item)
        else:
            item_val_list = act.brandact_itemVal_list

        # if the activity has no items, bail out
        if len(item_ids) == 0:
            print '# run_brandItems: no items in activity, act_id=%s, act_name=%s' % (act.brandact_id,act.brandact_name)
            return None

        print '# Activity Items crawler start:',time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())), act.brandact_id, act.brandact_name
        # multithreading: cap the number of concurrent threads
        if len(item_val_list) > Config.item_max_th:
            m_itemsObj = JHSItemM('main', Config.item_max_th)
        else: 
            m_itemsObj = JHSItemM('main', len(item_val_list))
        m_itemsObj.createthread()
        m_itemsObj.putItems(item_val_list)
        m_itemsObj.run()

        item_list = m_itemsObj.items
        print '# Activity find new Items num:', len(item_val_list)
        print '# Activity crawl Items num:', len(item_list)
        giveup_items = m_itemsObj.giveup_items
        if len(giveup_items) > 0:
            print '# Activity giveup Items num:',len(giveup_items)
            raise Common.RetryException('# run_actItems: actid:%s actname:%s some items retry more than max times..'%(str(act.brandact_id),str(act.brandact_name)))
        print '# Activity Items end:',time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())), act.brandact_id, act.brandact_name
        return item_list
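
    # If any item worker exhausts its retries, run_actItems() raises
    # Common.RetryException; crawlPage() catches it and re-queues the whole
    # activity message (run_item() additionally narrows msg['val'] to the
    # given-up items via self.giveup_val before re-queueing).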

    # To merge activity
    def mergeAct(self, act, prev_act):
        if prev_act:
            # merge the item ID lists from this crawl and the previous one
            prev_item_ids  = prev_act["item_ids"]
            act.brandact_itemids   = Common.unionSet(act.brandact_itemids, prev_item_ids)

            # keep the activity's first crawl time
            act.crawling_time = Common.str2timestamp(prev_act["crawl_time"])

            if not act.brandact_name or act.brandact_name == '':
                act.brandact_name = prev_act["act_name"]
            if not act.brandact_url or act.brandact_url == '':
                act.brandact_url = prev_act["act_url"]
            if not act.brandact_position or str(act.brandact_position) == '0':
                act.brandact_position = prev_act["act_position"]
            if not act.brandact_enterpic_url or act.brandact_enterpic_url == '':
                act.brandact_enterpic_url = prev_act["act_enterpic_url"]
            if not act.brandact_remindNum or str(act.brandact_remindNum) == '0':
                act.brandact_remindNum = prev_act["act_remindnum"]
            if not act.brandact_coupons or act.brandact_coupons == []:
                act.brandact_coupon = prev_act["act_coupon"]
                act.brandact_coupons = prev_act["act_coupons"].split(Config.sep)
            if not act.brandact_starttime or act.brandact_starttime == 0.0: 
                act.brandact_starttime = Common.str2timestamp(prev_act["start_time"])
            if not act.brandact_endtime or act.brandact_endtime == 0.0:
                act.brandact_endtime = Common.str2timestamp(prev_act["end_time"])
            if not act.brandact_other_ids or act.brandact_other_ids == '':
                act.brandact_other_ids = prev_act["_act_ids"]

    # To put act db
    def putActDB(self, act, prev_act):
        # pre-launch (warm-up) info
        if self._crawl_type == 'main':
            self.mysqlAccess.insertJhsActComing(act.outSql()) 

        # redis
        self.mergeAct(act, prev_act)
        
        if self._crawl_type == 'main':
            # mysql
            if prev_act:
                print '# update activity, id:%s name:%s'%(act.brandact_id, act.brandact_name)
                self.mysqlAccess.updateJhsAct(act.outSqlForUpdate())
            else:
                print '# insert activity, id:%s name:%s'%(act.brandact_id, act.brandact_name)
                self.mysqlAccess.insertJhsAct(act.outSql())

        # mongo
        # store the raw pages
        _pages = act.outItemPage(self._crawl_type)
        self.mongofsAccess.insertJHSPages(_pages)

    # To process activity in redis
    def procActRedis(self, act, prev_act, items_list):
        # item ids crawled for this activity
        act.brandact_itemids = []
        if items_list:
            for item in items_list:
                # item juid
                if str(item[1]) != '':
                    act.brandact_itemids.append(str(item[1]))
                # item id
                if str(item[10]) != '':
                    act.brandact_itemids.append(str(item[10]))

        # redis
        self.mergeAct(act, prev_act)
        keys = [self.worker_type, str(act.brandact_id)]
        val = act.outTupleForRedis()
        self.redisAccess.write_jhsact(keys, val)

    # To process activity
    def procAct(self, act, prev_act, items_list):
        # item ids crawled for this activity
        act.brandact_itemids = []
        if items_list:
            for item in items_list:
                # item juid
                if str(item[1]) != '':
                    act.brandact_itemids.append(str(item[1]))
                # item id
                if str(item[10]) != '':
                    act.brandact_itemids.append(str(item[10]))
        # store the crawled activity info in redis
        self.putActDB(act, prev_act)

    # ITEM queue
    def run_item(self, msg, _val):
        # default data
        msg_val = msg["val"]
        brandact_id, brandact_name, item_val_list = msg_val
        print '# Activity Items start:',time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())), brandact_id, brandact_name
        # multithreading: cap the number of concurrent threads
        max_th = Config.item_max_th
        if len(item_val_list) > max_th:
            m_itemsObj = JHSItemM(self._crawl_type, max_th, _val)
        else:
            m_itemsObj = JHSItemM(self._crawl_type, len(item_val_list), _val)
        m_itemsObj.createthread()
        m_itemsObj.putItems(item_val_list)
        m_itemsObj.run()

        item_list = m_itemsObj.items
        print '# Activity Items num:', len(item_val_list)
        print '# Activity crawl Items num:', len(item_list)
        giveup_items = m_itemsObj.giveup_items
        if len(giveup_items) > 0:
            print '# Activity giveup Items num:',len(giveup_items)
            self.giveup_val = (brandact_id, brandact_name, giveup_items)
            raise Common.RetryException('# run_item: actid:%s actname:%s some items retry more than max times..'%(str(brandact_id),str(brandact_name)))
        print '# Activity Items end:',time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())), brandact_id, brandact_name

    def process(self, _obj, _crawl_type, _val=None):
        self.init_crawl(_obj, _crawl_type)

        i, M = 0, 20
        if _obj == 'cat':
            M = 10
        n = 0
        while True: 
            if _crawl_type and _crawl_type != '':
                _key = '%s_%s_%s' % (self.jhs_type,_obj,_crawl_type)
            else:
                _key = '%s_%s' % (self.jhs_type,_obj)
            _msg = self.redisQueue.get_q(_key)

            # queue is empty
            if not _msg:
                i += 1
                if i > M:
                    print '# not get queue of key:',_key,time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
                    print '# all get num of item in queue:',n
                    break
                time.sleep(10)
                continue
            n += 1
            try:
                self.crawlPage(_obj, _crawl_type, _key, _msg, _val)
            except Exception as e:
                print '# exception err in process of JHSWorker:',e,_key,_msg

    # delete expired activities from the redis DB
    def delAct(self, _acts):
        i = 0
        for _act in _acts:
            keys = [self.worker_type, str(_act[0])]

            item = self.redisAccess.read_jhsact(keys)
            if item:
                end_time = item["end_time"]
                now_time = Common.time_s(self.crawling_time)
                # delete the activity if it has expired
                if now_time > end_time: 
                    i += 1
                    self.redisAccess.delete_jhsact(keys)
        print '# delete acts num:',i

    def delItem(self, _items):
        i = 0
        for _item in _items:
            keys = [self.worker_type, str(_item[0])]

            item = self.redisAccess.read_jhsitem(keys)
            if item:
                end_time = item["end_time"]
                now_time = Common.time_s(self.crawling_time)
                # delete the item if it has expired
                if now_time > end_time: 
                    i += 1
                    self.redisAccess.delete_jhsitem(keys)
        print '# delete items num:',i

    # find ended activities
    def scanEndActs(self, val):
        _acts = self.mysqlAccess.selectJhsActEnd(val)
        print '# end acts num:',len(_acts)
        # delete activities that have already ended
        self.delAct(_acts)

    # find ended items
    def scanEndItems(self, val):
        _items = self.mysqlAccess.selectJhsItemEnd(val)
        print '# end items num:',len(_items)
        # delete items that have already ended
        self.delItem(_items)

    # acts redis
    def actsRedis(self):
        _acts = self.mysqlAccess.selectActsRedisdata()
        print '# acts num:',len(_acts)
        i = 0
        for _act in _acts:
            act_id = _act[2]
            #_itemids = self.mysqlAccess.selectItemsids(str(act_id))
            #item_ids = []
            #for _itemid in _itemids:
            #    item_ids.append(str(_itemid[0]))
            #    item_ids.append(str(_itemid[1]))
            #act_val = _act + (item_ids,)
            #print act_val
            #keys = [self.worker_type, str(act_id)]
            #print keys
            #if self.redisAccess.exist_jhsact(keys):
                #act_redis = self.redisAccess.read_jhsact(keys)
                #if len(act_redis) != 15:
                #    print act_redis
                #    i += 1
                #print self.redisAccess.read_jhsact(keys)
                #self.redisAccess.delete_jhsact(keys)
            #self.redisAccess.write_jhsact(keys, act_val)
            #i += 1
            #break
        print '# redis acts num:',i

    # items redis
    def itemsRedis(self):
        _items = self.mysqlAccess.selectItemRedisdata()
        print '# items num:', len(_items)
        i = 0
        #for _item in _items:
            #msg = self.message.jhsitemMsg(_item)
            #print msg
            #keys = [self.worker_type, str(_item[0])]
            #print keys
            #if self.redisAccess.exist_jhsitem(keys):
                #print self.redisAccess.read_jhsitem(keys)
                #self.redisAccess.delete_jhsitem(keys)
            #self.redisAccess.write_jhsitem(keys, msg)
            #i += 1 
            #break
        print '# redis items num:',i
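
# Note: delAct()/delItem() above reduce to "read the cached record, compare
# its end_time string with now, delete when expired". A self-contained sketch
# of that comparison; the key layout, JSON encoding and time format are
# assumptions modelled on the code above, not the project's actual schema.
import time
import json
import redis

r = redis.Redis()
now_s = time.strftime('%Y-%m-%d %H:%M:%S')      # assumed Common.time_s format
deleted = 0
for key in r.keys('jhs_brand:item:*'):          # hypothetical key layout
    raw = r.get(key)
    if raw is None:
        continue
    record = json.loads(raw)
    # plain string comparison works because both sides share one format
    if now_s > record['end_time']:
        r.delete(key)
        deleted += 1
print('# deleted %d expired records' % deleted)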
Beispiel #58
0
#!/usr/bin/python
#coding=utf-8
import urllib3
import base64
import time
import json
import threading

from queue_config import master_host, author_login
from RedisQueue import RedisQueue

redis_conn = {'host': master_host[0], 'port': master_host[1]}

q = RedisQueue('account_login', **redis_conn)
http = urllib3.PoolManager(num_pools=50)

def worker(value):
    params = {}
    params['account_login'] = base64.encodestring(value)
    r = http.request('POST', author_login, params)

    # server error: push the value back onto the queue
    if r.status != 200:
        q.put(value)

    # IP whitelist check failed: push the value back onto the queue
    # (r.data is the raw response body; assuming it is a JSON payload)
    if json.loads(r.data).get('status') == 10002:
        q.put(value)
    print r.data

while 1:
    # time.sleep(1);