def run(self):
    sleep(10)
    self.last_mean = .015
    self.q = RedisQueue('test')
    print('start')
    self.conn = sqlite3.connect("data.db")
    while not self.q.empty():
        # queue items come back as the bytes repr of a Python list;
        # strip the b"[...]" wrapper and quotes, then split into feature names
        features = str(self.q.get())[3:-2].replace("'", "").split(', ')
        self.features = list(features)
        for self.hold_time in ['_10']:
            df = self.df[self.features + ['stock_perc_change' + self.hold_time,
                                          'abnormal_perc_change' + self.hold_time]]
            targets = [self.df['stock_perc_change' + self.hold_time],
                       self.df['abnormal_perc_change' + self.hold_time]]
            positive_dfs = []
            negative_dfs = []
            for i in range(8):
                # .iloc replaces the removed DataFrame.ix indexer
                a_train, a_test, b_train, b_test = train_test_split(
                    df.iloc[:, :-2], df.iloc[:, -2:], test_size=.4)
                self.train(a_train, b_train)
                test_result, negative_df, positive_df = self.test(a_test, b_test)
                if test_result:
                    positive_dfs.append(positive_df)
                    negative_dfs.append(negative_df)
                else:
                    break
            if test_result:
                self.get_result(pd.concat(positive_dfs), pd.concat(negative_dfs))
class Chunker(object):
    def __init__(self, redis_host):
        self.work_queue = RedisQueue(redis_host, "inqueue")

    def run(self):
        chunk_id = 0
        # build lists (not xrange objects) so they can be concatenated and shuffled in place
        a_range = range(1, 10) + range(10, 256)
        shuffle(a_range)
        for a in a_range:
            b_range = range(1, 255)
            shuffle(b_range)
            for b in b_range:
                # skip the private 172.16.0.0/12 and 192.168.0.0/16 ranges
                if a == 172 and b in xrange(16, 32):
                    continue
                if a == 192 and b == 168:
                    continue
                c_range = range(1, 255)
                shuffle(c_range)
                for c in c_range:
                    ip_range = "{0}.{1}.{2}.0/24".format(a, b, c)
                    print "Sending chunk {0} range: {1}".format(chunk_id, ip_range)
                    task = {"range": ip_range, "id": chunk_id}
                    self.work_queue.put(task)
                    chunk_id += 1
                    sleep(10)

    def run_test(self):
        self.work_queue.put({"range": "129.21.50.0/24", "id": 0})
        self.work_queue.put({"range": "129.21.49.0/24", "id": 1})
class TCQ():
    '''A class of tc redis queue'''
    def __init__(self, _obj, _q_type=None):
        self._obj = _obj
        self._q_type = _q_type  # crawler type
        self.tc_type = Config.TC_TYPE  # queue type
        # DB
        self.redisQueue = RedisQueue()  # redis queue
        # message
        self.message = Message()
        # queue key
        if self._q_type:
            self._key = '%s_%s_%s' % (self.tc_type, self._obj, self._q_type)
        else:
            self._key = '%s_%s' % (self.tc_type, self._obj)

    # clear queue
    def clearQ(self):
        self.redisQueue.clear_q(self._key)

    # write to the redis queue
    def putQ(self, _msg):
        self.redisQueue.put_q(self._key, _msg)

    # convert items into queue messages
    def putlistQ(self, item_list):
        for _item in item_list:
            _val = (0, self._obj, self._q_type) + _item
            msg = self.message.QueueMsg(self._obj, _val)
            if msg:
                self.putQ(msg)
class JHSQ():
    '''A class of jhs redis queue'''
    def __init__(self, _obj, _q_type=None):
        self._obj = _obj
        self._q_type = _q_type  # queue type
        self.jhs_type = Config.JHS_TYPE  # queue type
        # DB
        self.redisQueue = RedisQueue()  # redis queue
        # message
        self.message = Message()
        # queue key
        if self._q_type:
            self._key = '%s_%s_%s' % (self.jhs_type, self._obj, self._q_type)
        else:
            self._key = '%s_%s' % (self.jhs_type, self._obj)

    # clear queue
    def clearQ(self):
        self.redisQueue.clear_q(self._key)

    # write to the redis queue
    def putQ(self, _msg):
        self.redisQueue.put_q(self._key, _msg)

    # convert items into queue messages
    def putlistQ(self, item_list):
        for _item in item_list:
            _val = (0, self._obj, self.jhs_type) + _item
            msg = self.message.jhsQueueMsg(self._obj, _val)
            if msg:
                self.putQ(msg)
def __init__(self, redis_host, es_urls):
    self.pages_queue = RedisQueue(redis_host, "pagesqueue")  # take pages out of this queue
    self.links_queue = RedisQueue(redis_host, "linksqueue")  # put links into this queue
    self.connection = pyelasticsearch.ElasticSearch(es_urls)
    try:
        self.connection.create_index("webpages")
    except:
        # index may already exist
        pass
def __init__(self, in_queue_namespace, out_queue_namespace):
    self.in_queue_namespace = in_queue_namespace
    self.out_queue_namespace = out_queue_namespace
    self.in_queue = RedisQueue(in_queue_namespace)
    self.out_queue = RedisQueue(out_queue_namespace)
    print "Parser worker loaded"
def start_boss_task():
    pos_lst = ['JAVA', 'C', 'Python', 'PHP', 'IOS', 'Android']
    url = ['https://www.zhipin.com/c101010100-p100104/?page={page}&ka=page-{page}'.format(page=str(i + 1))
           for i in xrange(2)]
    for p in pos_lst:
        url += ['https://www.zhipin.com/c101010100/h_101010100/?query={pos}&page={page}&ka=page-{page}'.format(
            page=str(i + 1), pos=p) for i in xrange(2)]
    rq = RedisQueue()
    rq.push_task('boss_root', url, level=2)
def __init__(self, in_queue_namespace, out_queue_namespace, apikey):
    self.in_queue_namespace = in_queue_namespace
    self.out_queue_namespace = out_queue_namespace
    self.apikey = apikey
    self.in_queue = RedisQueue(in_queue_namespace)
    self.out_queue = RedisQueue(out_queue_namespace)
    print "Fetcher loaded with apikey", self.apikey
class RedisMessageProvider(MessageProvider):
    def __init__(self, host, port, queue_name):
        self.queue = RedisQueue(name=queue_name, namespace='queue', host=host, port=port)
        self.queue.wait_for()

    def get_message(self):
        return self.queue.get()
class QueueManage(object):
    def __init__(self, name):
        self.q_obj = RedisQueue(name)

    def get_queue_data(self):
        q_re = self.q_obj.get_all()
        return q_re

    def queue_size(self):
        return self.q_obj.qsize()
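Every snippet in this collection assumes a small RedisQueue wrapper that is not included here, and its constructor signature and method names (put/get/empty/qsize, get_all vs. getall, put_q/get_q, clear, remove, push_task, ...) clearly vary between projects. The following is only a minimal sketch of the FIFO interface most of these callers rely on, built on redis-py's RPUSH/BLPOP; it is an assumption for orientation, not the implementation used by any of the snippets.

# Minimal sketch (assumption, not the original): a FIFO wrapper backed by a
# Redis list via redis-py, matching the most common calls seen above and below.
import redis

class RedisQueue(object):
    def __init__(self, name, namespace='queue', host='localhost', port=6379,
                 db=0, password=None):
        self.__db = redis.Redis(host=host, port=port, db=db, password=password)
        self.key = '%s:%s' % (namespace, name)  # e.g. 'queue:test'

    def qsize(self):
        return self.__db.llen(self.key)  # number of queued items

    def empty(self):
        return self.qsize() == 0

    def put(self, item):
        self.__db.rpush(self.key, item)  # enqueue at the tail

    def get(self, block=True, timeout=0):
        if block:
            item = self.__db.blpop(self.key, timeout=timeout)
            if item:
                item = item[1]  # blpop returns a (key, value) pair
        else:
            item = self.__db.lpop(self.key)
        return item

    def getall(self):
        return self.__db.lrange(self.key, 0, -1)  # non-destructive dump

    def clear(self):
        self.__db.delete(self.key)  # drop the whole queue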
def getQueue(self, ipaddr, port, name, namespace="queues", fromcache=True):
    if not fromcache:
        return RedisQueue(self.get(ipaddr, port, fromcache=False), name, namespace=namespace)
    key = "%s_%s_%s_%s" % (ipaddr, port, name, namespace)
    if key not in self._redisq:
        self._redisq[key] = RedisQueue(self.get(ipaddr, port), name, namespace=namespace)
    return self._redisq[key]
def main():
    done_que = RedisQueue('seed')
    run_que = RedisQueue('run')

    # worker_download
    seeds = ['http://www.2345.com']
    workder_download = Worker(seeds, done_que, run_que)
    try:
        workder_download.work()
    except KeyboardInterrupt:
        print "Ctrl+C"
        if workder_download.debugnosave == 0:
            workder_download.savestate()
def dump_traffic():
    global packets_dump
    global redis_packet_queue
    global redis_results_queue
    redis_packet_queue = RedisQueue('packet_worker_queue')
    redis_results_queue = RedisQueue('packet_results_queue')
    print('[*] Packet dumping thread is now online')
    while True:
        ts = time.time()
        date = dt.datetime.fromtimestamp(ts).strftime('%d-%m-%Y_%H:%M:%S')
        save_as_csv('./packet_dump{}.csv'.format(date),
                    ['Status code', 'Method', 'Version', 'Scheme',
                     'Request Length', 'Response Length',
                     'Request Entropy', 'Response Entropy',
                     'Client Connection', 'Server Connection'],
                    packets_dump)
        time.sleep(THREAD_SLEEP_TIME)
class Crawler(object):
    def __init__(self, redis_host, depth=10):
        self.links_queue = RedisQueue(redis_host, "linksqueue")
        self.pages_queue = RedisQueue(redis_host, "pagesqueue")

    def run(self):
        while True:
            link = self.links_queue.get().data
            try:
                page = WebPage(requests.get(link).text, link, 80)
            except:
                print("Exception GETing {0}".format(link))
                continue
            self.pages_queue.put(page.to_dict())
def __init__(self, _obj, _q_type=None):
    self._obj = _obj
    self._q_type = _q_type  # crawler type
    self.tc_type = Config.TC_TYPE  # queue type
    # DB
    self.redisQueue = RedisQueue()  # redis queue
    # message
    self.message = Message()
    # queue key
    if self._q_type:
        self._key = '%s_%s_%s' % (self.tc_type, self._obj, self._q_type)
    else:
        self._key = '%s_%s' % (self.tc_type, self._obj)
def __init__(self):
    # jhs brand type
    self.worker_type = Config.JHS_Brand
    # DB
    self.jhs_type = Config.JHS_TYPE  # queue type
    self.mysqlAccess = MysqlAccess()  # mysql access
    self.redisQueue = RedisQueue()  # redis queue
    self.redisAccess = RedisAccess()  # redis db
    self.mongofsAccess = MongofsAccess()  # mongodb fs access
    # JSON data fetcher
    self.jsonpage = Jsonpage()
    # crawler settings
    self.crawler = TBCrawler()
    # page template parser
    self.brand_temp = JHSBrandTEMP()
    # message
    self.message = Message()
    # crawl time settings
    self.crawling_time = Common.now()  # current crawl time
    self.begin_time = Common.now()
    self.begin_date = Common.today_s()
    self.begin_hour = Common.nowhour_s()
def __init__(self):
    # jhs group item type
    self.worker_type = Config.JHS_GroupItem
    self.jhs_type = Config.JHS_TYPE  # queue type
    # message
    self.message = Message()
    # JSON data fetcher
    self.jsonpage = Jsonpage()
    # crawler settings
    self.crawler = TBCrawler()
    # crawl time settings
    self.crawling_time = Common.now()  # current crawl time
    self.begin_time = Common.now()
    self.begin_date = Common.today_s()
    self.begin_hour = Common.nowhour_s()
    # DB
    self.mysqlAccess = MysqlAccess()  # mysql access
    self.redisQueue = RedisQueue()  # redis queue
    self.redisAccess = RedisAccess()  # redis access
    self.mongofsAccess = MongofsAccess()  # mongodb fs access
def process_request_origin(self, request, spider):
    redis = RedisQueue('proxy_ip')
    if not redis.empty():
        proxy_ip = redis.get()
    else:
        proxy_ip = get_ip()
    proxy_para = {
        'ip_port': proxy_ip,
        'user_pass': ''
    }
    request.meta['proxy'] = "http://%s" % proxy_para['ip_port']
    if proxy_para['user_pass'] is not None:
        encoded_user_pass = base64.encodestring(proxy_para['user_pass'])
        request.headers['Proxy-Authorization'] = 'Basic ' + encoded_user_pass
    print "*********************** RedisProxyMiddleware Using proxy ip: %s *****" % proxy_para['ip_port']
    redis.put(proxy_ip)
class Receiver(object):
    def __init__(self, redis_host):
        self.output_queue = RedisQueue(redis_host, "outqueue")

    def run(self):
        while True:
            result = self.output_queue.get().data
            pprint(result)
            print "---"

    def run_dump(self):
        dumpfile = open("netcrawl.log", "w")
        while True:
            result = self.output_queue.get().data
            pprint(result)
            dumpfile.write(pformat(result) + "\n")
            dumpfile.flush()
def getRedisQueue(self, ipaddr, port, name, namespace="queues", fromcache=True):
    if not fromcache:
        return RedisQueue(self.getRedisClient(ipaddr, port, fromcache=False), name, namespace=namespace)
    key = "%s_%s_%s_%s" % (ipaddr, port, name, namespace)
    if key not in self.redisq:
        self.redisq[key] = RedisQueue(self.getRedisClient(ipaddr, port), name, namespace=namespace)
    return self.redisq[key]
def getGeventRedisQueue(self, ipaddr, port, name, namespace="queues", fromcache=False):
    fromcache = False  # @todo remove
    if not fromcache:
        return RedisQueue(self.getGeventRedisClient(ipaddr, port, False), name, namespace=namespace)
    key = "%s_%s_%s_%s" % (ipaddr, port, name, namespace)
    if key not in self.gredisq:
        self.gredisq[key] = RedisQueue(self.getGeventRedisClient(ipaddr, port), name, namespace=namespace)
    return self.gredisq[key]
class Indexer(object):
    def __init__(self, redis_host, es_urls):
        self.pages_queue = RedisQueue(redis_host, "pagesqueue")  # take pages out of this queue
        self.links_queue = RedisQueue(redis_host, "linksqueue")  # put links into this queue
        self.connection = pyelasticsearch.ElasticSearch(es_urls)
        try:
            self.connection.create_index("webpages")
        except:
            # index may already exist
            pass

    def run(self):
        while True:
            result = self.pages_queue.get().data
            result['tags'] = genTags(result['html'])
            self.connection.index('webpages', 'webpage', result, id=result['ip'])
            print('Indexed {0}'.format(result['ip']))
            for link in result['links']:
                self.links_queue.put(link)
class StudyscrapyPipeline(object):
    def __init__(self):
        # redis_db / redis_data_dict are module-level handles defined elsewhere
        self.q = RedisQueue(name='CSDN', host='localhost', port=6379, db=3)
        if redis_db.hlen(redis_data_dict) == 0:
            pass

    def process_item(self, item, spider):
        # fp = open(r'F:\Spider\Spider\studyscrapy\out.txt', 'a+')
        if redis_db.hexists(redis_data_dict, item['title']):
            print('item already in the queue <--')
            pass
        else:
            # fp.write(item['title']+', '+item['time']+'\n')
            self.q.put(item['title'] + ':' + item['time'])
            redis_db.hset(redis_data_dict, item['title'], item['time'])
            print('title: {0},time: {1} queued successfully'.format(item['title'], item['time']))
        return item
def index():
    stats = {
        'currentTemp': 0,
        'currentHumidity': 0,
        'lastUpdateTime': "never",
        'message': "Not initialized",
        'targetTemp': 0,
        'brightness': 0
    }
    form = UpdateForm()
    q = RedisQueue('brooder')
    dataPoints = []
    for item in q.getall():
        dataPoints.append(json.loads(str(item, 'utf-8')))
    if len(dataPoints) > 0:
        stats = dataPoints[-1]
    brooderConfig = json.load(open(brooderConfigFile))
    return render_template('main.html', **locals())
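The index() view above only reads the 'brooder' queue; for illustration, a hypothetical sensor-side producer feeding it could push JSON documents with the same field names the view expects (the values below are invented):

# Hypothetical producer for the 'brooder' queue read by index() above;
# field names mirror the stats dict the view expects, values are invented.
import json
import time
from RedisQueue import RedisQueue

q = RedisQueue('brooder')
q.put(json.dumps({
    'currentTemp': 37.5,
    'currentHumidity': 55,
    'lastUpdateTime': time.strftime('%Y-%m-%d %H:%M:%S'),
    'message': 'OK',
    'targetTemp': 37.5,
    'brightness': 80,
}))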
def __init__(self):
    # tc spot type
    self.worker_type = Config.TC_Spot
    # DB
    self.tc_type = Config.TC_TYPE  # queue type
    self.mysqlAccess = MysqlAccess()  # mysql access
    self.redisQueue = RedisQueue()  # redis queue
    self.mongofsAccess = MongofsAccess()  # mongodb fs access
    # crawler settings
    self.crawler = TCCrawler()
    # message
    self.message = Message()
    # crawl time settings
    self.crawling_time = Common.now()  # current crawl time
    self.begin_time = Common.now()
    self.begin_date = Common.today_s()
    self.begin_hour = Common.nowhour_s()
def dispose_ip(self, proxy_ip, redis_label):
    redis_list = []
    for i in range(REDIS_NUM):
        redis_list.append(RedisQueue('proxy_ip_%d' % i))
    redis_invalid_ip = RedisQueue('invalid_ip')
    if redis_label == REDIS_NUM - 1:
        redis_invalid_ip.put(proxy_ip)
        redis_list[0].put(get_ip())
    else:
        redis_list[redis_label].remove(proxy_ip)
        redis_list[redis_label + 1].put(proxy_ip)
        if redis_list[0].empty():
            redis_list[0].put(get_ip())
    new_redis_label = random.choice(range(REDIS_NUM))
    while redis_list[new_redis_label].empty():
        new_redis_label = random.choice(range(REDIS_NUM))
    new_proxy_ip = redis_list[new_redis_label].get()
    redis_list[new_redis_label].put(new_proxy_ip)
    return new_proxy_ip, new_redis_label
def select_ip(REDIS_NUM):
    redis_list = []
    for i in range(REDIS_NUM):
        redis_list.append(RedisQueue('proxy_ip_%d' % i))
    for each in redis_list:
        print each.key
    label = random.choice(range(REDIS_NUM))
    while redis_list[label].empty():
        label = random.choice(range(REDIS_NUM))
    proxy_ip = redis_list[label].get()
    redis_list[label].put(proxy_ip)
    return proxy_ip, label
def main():
    done_que = RedisQueue('seed')
    run_que = RedisQueue('run')
    run_que.flushdb()

    conn = sqlite3.connect('site_data.db')
    conn.execute(
        "create table if not exists mainpages (id integer primary key autoincrement, url TEXT,headers TEXT,content BLOB)"
    )

    spend = 0
    cnt = 0
    size = 0
    while True:
        data = cPickle.loads(done_que.get())
        st = time.time()
        urls = geturls(data['url'], data['content'])
        if len(urls) == 0:
            continue
        for url in urls:
            # bfdone: global de-dup filter of already-seen URLs, defined elsewhere
            if url not in bfdone:
                run_que.put(url)
        gziphtml = sqlite3.Binary(gzip.zlib.compress(data['content']))
        size += len(gziphtml)
        conn.execute(
            "insert into mainpages (url,headers,content) values (?,?,?)",
            (data['url'], str(data['headers']), gziphtml))
        et = time.time()
        spend += (et - st)
        cnt += 1
        if cnt % 10 == 0:
            print "cost:", spend / cnt, cnt, done_que.qsize(), size / 1024 / 1024
            conn.commit()
class FetcherWorker:
    def __init__(self, in_queue_namespace, out_queue_namespace, apikey):
        self.in_queue_namespace = in_queue_namespace
        self.out_queue_namespace = out_queue_namespace
        self.apikey = apikey
        self.in_queue = RedisQueue(in_queue_namespace)
        self.out_queue = RedisQueue(out_queue_namespace)
        print "Fetcher loaded with apikey", self.apikey

    def run(self):
        while 1:
            base_url = self.in_queue.get()
            if base_url == "None":
                # add end-of-queue markers for parsers
                self.out_queue.put("None")
                # ends program
                break
            url = base_url + self.apikey
            t1 = time.time()
            print "fetching try 1", url
            resp = urllib2.urlopen(url)
            if resp.code == 200:
                text = resp.read()
                self.out_queue.put(text)
            else:
                print 'failed once', url
                time.sleep(10)
                print "fetching try 2", url
                resp = urllib2.urlopen(url)
                if resp.code == 200:
                    text = resp.read()
                    self.out_queue.put(text)
            print "done fetching"
            # make sure we don't use the same API key within 2 seconds
            t2 = time.time()
            if t2 - t1 < 2.0:
                time.sleep(2.0 - (t2 - t1))
def __init__(self, _obj, _q_type=None):
    self._obj = _obj
    self._q_type = _q_type  # queue type
    self.jhs_type = Config.JHS_TYPE  # queue type
    # DB
    self.redisQueue = RedisQueue()  # redis queue
    # message
    self.message = Message()
    # queue key
    if self._q_type:
        self._key = '%s_%s_%s' % (self.jhs_type, self._obj, self._q_type)
    else:
        self._key = '%s_%s' % (self.jhs_type, self._obj)
def __init__(self, key, q_type, thread_num=10, a_val=None):
    # parent construct
    MyThread.__init__(self, thread_num)
    # thread lock
    self.mutex = threading.Lock()
    self.xc_type = Config.XC_TYPE  # xc type
    #self.item_type = q_type  # item queue type
    # db
    self.mysqlAccess = MysqlAccess()  # mysql access
    self.redisQueue = RedisQueue()  # redis queue
    self.mongofsAccess = MongofsAccess()  # mongodb fs access
    # xc queue type
    self.xc_queue_type = q_type
    # new...
    self._key = key  # redis queue key
    # appendix val
    self.a_val = a_val
    # return items
    self.items = []
    # dial client
    self.dial_client = DialClient()
    # local ip
    self._ip = Common.local_ip()
    # router tag
    self._tag = 'ikuai'
    #self._tag = 'tpent'
    # give up item, retry too many times
    self.giveup_items = []
def main(run_type, store_num):
    q = RedisQueue(store_num)
    if run_type == 'gen':
        for i in range(len(sectorPolygons1954.sectors)):
            s = generateSectors(sectorPolygons1954.sectors[i], i, q, store_num)
            print('starting thread ' + str(i))
            s.start()
    elif run_type == 'run':
        batch_size = 30
        uri = 'localhost:27017'
        client = pymongo.MongoClient(uri)
        db = client['streets']
        streets = db[store_num]
        streets.create_index([("latitude", pymongo.DESCENDING),
                              ("longitude", pymongo.DESCENDING)])
        for i in range(int(batch_size)):
            rg = request_getter(q, store_num)
            print('starting request thread ' + str(i))
            rg.start()
        while q.qsize():
            sleep(10)
            print(q.qsize())
class ParserWorker():
    def __init__(self, in_queue_namespace, out_queue_namespace):
        self.in_queue_namespace = in_queue_namespace
        self.out_queue_namespace = out_queue_namespace
        self.in_queue = RedisQueue(in_queue_namespace)
        self.out_queue = RedisQueue(out_queue_namespace)
        print "Parser worker loaded"

    def run(self):
        while 1:
            xml_text = self.in_queue.get()
            print "Received XML"
            if xml_text == "None":
                self.out_queue.put("None")
                break
            json_doc = DataParser.parse_get_state_stats_resp(xml_text)
            print "Made JSON"
            self.out_queue.put(json_doc)
def __init__(self, url='http://weixin.sogou.com/weixin', key_word=None):
    self.url = url
    self.key_word = key_word
    self.headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6,ja;q=0.4,zh-TW;q=0.2,mt;q=0.2',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'Host': 'weixin.sogou.com',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36',
    }
    self.session = Session()
    self.queue = RedisQueue()
    self.mysql = MySQL()
def process_exception(self, request, exception, spider):
    request_ip = request.meta['proxy']
    invalid_ip = request_ip.split('//')[1]
    redis = RedisQueue('proxy_ip')
    redis_invalid_ip = RedisQueue('invalid_ip')
    if not redis.empty():
        redis.remove(invalid_ip)
        redis_invalid_ip.put(invalid_ip)
        print '+++++++++++++++++++++++%s' % exception
        print '-----------------------removing ip from redis: %s' % invalid_ip
    new_ip = get_ip()
    proxy_para = {
        'ip_port': new_ip,
        'user_pass': ''
    }
    request.meta['proxy'] = "http://%s" % proxy_para['ip_port']
    if proxy_para['user_pass'] is not None:
        encoded_user_pass = base64.encodestring(proxy_para['user_pass'])
        request.headers['Proxy-Authorization'] = 'Basic ' + encoded_user_pass
    print ">>>>>>>>>>>>>>>>>>>>>>>>>>> switch %s to ip: %s *****" % (invalid_ip, proxy_para['ip_port'])
    redis.put(new_ip)
def process_request(self, request, spider):
    # proxy_ip, redis_label = self.select_ip(REDIS_NUM)
    redis_list = []
    for i in range(REDIS_NUM):
        redis_list.append(RedisQueue('proxy_ip_%d' % i))
    redis_label = random.choice(range(REDIS_NUM))
    while redis_list[redis_label].empty():
        redis_label = random.choice(range(REDIS_NUM))
    proxy_ip = redis_list[redis_label].get()
    redis_list[redis_label].put(proxy_ip)
    proxy_para = {'ip_port': proxy_ip, 'user_pass': ''}
    request.meta['proxy'] = "http://%s" % proxy_para['ip_port']
    request.meta['redis_label'] = redis_label
    if proxy_para['user_pass'] is not None:
        encoded_user_pass = base64.encodestring(proxy_para['user_pass'])
        request.headers['Proxy-Authorization'] = 'Basic ' + encoded_user_pass
    print "*********************** RedisProxyMiddleware Using proxy ip: %s *****" % proxy_para['ip_port']
class DatastoreWriterWorker():
    def __init__(self, in_queue_namespace):
        self.in_queue_namespace = in_queue_namespace
        self.in_queue = RedisQueue(in_queue_namespace)

    def run(self):
        while 1:
            json_doc = self.in_queue.get()
            if json_doc == "None":
                break
            print "DatastoreWriterWorker got", json_doc
            print
            print "Write to KV store, Fluentd, and MySQL"
            print
            print
def __init__(self, key, q_type, thread_num=10, a_val=None):
    # parent construct
    MyThread.__init__(self, thread_num)
    # thread lock
    self.mutex = threading.Lock()
    self.tc_type = Config.TC_TYPE  # tc type
    #self.item_type = q_type  # item queue type
    # db
    self.mysqlAccess = MysqlAccess()  # mysql access
    self.redisQueue = RedisQueue()  # redis queue
    self.mongofsAccess = MongofsAccess()  # mongodb fs access
    # tc queue type
    self.tc_queue_type = q_type
    # new...
    self._key = key  # redis queue key
    # appendix val
    self.a_val = a_val
    # return items
    self.items = []
    # dial client
    self.dial_client = DialClient()
    # local ip
    self._ip = Common.local_ip()
    # router tag
    self._tag = 'ikuai'
    #self._tag = 'tpent'
    # give up item, retry too many times
    self.giveup_items = []
def __init__(self, itemtype, q_type, thread_num=10, a_val=None):
    # parent construct
    MyThread.__init__(self, thread_num)
    # thread lock
    self.mutex = threading.Lock()
    self.jhs_type = Config.JHS_TYPE  # jhs type
    self.item_type = itemtype  # item type
    # db
    self.mysqlAccess = MysqlAccess()  # mysql access
    self.redisQueue = RedisQueue()  # redis queue
    self.mongofsAccess = MongofsAccess()  # mongodb fs access
    # jhs queue type
    self.jhs_queue_type = q_type
    # h: hourly
    self._key = '%s_%s_%s' % (self.jhs_type, self.item_type, self.jhs_queue_type)
    # appendix val
    self.a_val = a_val
    # activity items
    self.items = []
    # dial client
    self.dial_client = DialClient()
    # local ip
    self._ip = Common.local_ip()
    # router tag
    self._tag = 'ikuai'
    #self._tag = 'tpent'
    # give up item, retry too many times
    self.giveup_items = []
# store the website
# dbname with ip? sitedata_2nd_
# db = dbd['runque']
# db = dbd['extracturls']
dbd = dict()
dbd['runque'] = 1
dbd['extracturls'] = 2

host = "127.0.0.1"
password = '******'

# first insert into the done_site.bin
rq = RedisQueue(name='extracturls', host=host, password=password, db=dbd['extracturls'])
rr = RedisQueue(name='runque', host=host, password=password, db=dbd['runque'])
print rq.qsize()
print rr.qsize()
#exit(0)
def __init__(self, in_queue_namespace):
    self.in_queue_namespace = in_queue_namespace
    self.in_queue = RedisQueue(in_queue_namespace)
def __init__(self, redis_host):
    self.work_queue = RedisQueue(redis_host, "inqueue")
class TCItemRedisM(MyThread):
    '''A class of tc Item redis queue'''
    def __init__(self, key, q_type, thread_num=10, a_val=None):
        # parent construct
        MyThread.__init__(self, thread_num)
        # thread lock
        self.mutex = threading.Lock()
        self.tc_type = Config.TC_TYPE  # tc type
        #self.item_type = q_type  # item queue type
        # db
        self.mysqlAccess = MysqlAccess()  # mysql access
        self.redisQueue = RedisQueue()  # redis queue
        self.mongofsAccess = MongofsAccess()  # mongodb fs access
        # tc queue type
        self.tc_queue_type = q_type
        # new...
        self._key = key  # redis queue key
        # appendix val
        self.a_val = a_val
        # return items
        self.items = []
        # dial client
        self.dial_client = DialClient()
        # local ip
        self._ip = Common.local_ip()
        # router tag
        self._tag = 'ikuai'
        #self._tag = 'tpent'
        # give up item, retry too many times
        self.giveup_items = []

    # To dial router
    def dialRouter(self, _type, _obj):
        try:
            _module = '%s_%s' % (_type, _obj)
            self.dial_client.send((_module, self._ip, self._tag))
        except Exception as e:
            Common.log('# To dial router exception: %s' % e)

    def push_back(self, L, v):
        if self.mutex.acquire(1):
            L.append(v)
            self.mutex.release()

    def crawlRetry(self, _key, msg):
        if not msg:
            return
        msg['retry'] += 1
        _retry = msg['retry']
        _obj = msg["obj"]
        max_time = Config.crawl_retry
        if _obj == 'item':
            max_time = Config.item_crawl_retry
        if _retry < max_time:
            self.redisQueue.put_q(_key, msg)
        else:
            Common.log('# retry too many time, no get msg:')
            Common.log(msg)

    # insert item
    def insertIteminfo(self, iteminfosql_list, f=False):
        if f or len(iteminfosql_list) >= Config.item_max_arg:
            if len(iteminfosql_list) > 0:
                self.mysqlAccess.insertTCItem(iteminfosql_list)
            return True
        return False

    # item sql list
    def crawl(self):
        _iteminfosql_list = []
        i, M = 0, 2
        n = 0
        while True:
            try:
                _data = self.redisQueue.get_q(self._key)
                # queue is empty
                if not _data:
                    # queue is empty: flush pending rows, exit after M empty polls
                    self.insertIteminfo(_iteminfosql_list, True)
                    _iteminfosql_list = []
                    i += 1
                    if i > M:
                        Common.log('# all get itemQ item num: %d' % n)
                        Common.log('# not get itemQ of key: %s' % self._key)
                        break
                    time.sleep(10)
                    continue
                n += 1
                item = None
                obj = 'item'
                if self.tc_queue_type == 'spot':
                    # item instance
                    item = Item()
                    #_val = _data[1]
                    _val = _data["val"]
                    if self.a_val:
                        _val = _val + self.a_val
                    item.antPage(_val)
                    # collect the parsed row
                    self.push_back(self.items, item.outSql())
                    # write to DB
                    tickets = item.item_tickets
                    if tickets and len(tickets) > 0:
                        self.mysqlAccess.insertTCTicket(tickets)
                    iteminfoSql = item.outSql()
                    _iteminfosql_list.append(iteminfoSql)
                    if self.insertIteminfo(_iteminfosql_list):
                        _iteminfosql_list = []
                else:
                    continue
                # save pages
                #if item and obj != '':
                #    _pages = item.outItemPage(obj, self.tc_queue_type)
                #    self.mongofsAccess.insertTCPages(_pages)
                # delay
                time.sleep(1)
            except Common.NoItemException as e:
                Common.log('# Not item exception: %s' % e)
            except Common.NoPageException as e:
                Common.log('# Not page exception: %s' % e)
            except Common.InvalidPageException as e:
                self.crawlRetry(self._key, _data)
                Common.log('# Invalid page exception: %s' % e)
            except Exception as e:
                Common.log('# Unknown exception crawl item: %s' % e)
                Common.traceback_log()
                self.crawlRetry(self._key, _data)
                if str(e).find('Read timed out') == -1:
                    # redial the router
                    try:
                        self.dialRouter(4, 'item')
                    except Exception as e:
                        Common.log('# DailClient Exception err: %s' % e)
                    time.sleep(10)
                time.sleep(random.uniform(10, 30))
from RedisQueue import RedisQueue
import sys
import random
from pymongo import MongoClient

if __name__ == '__main__':
    db = MongoClient()
    exists = db.zhihu.zhihu_answers
    exist_owners = []
    for e in exists.find():
        exist_owners.append(e['owner'])
    print(len(exist_owners))
    all_ids = [line.strip().split('\t')[0] for line in open('./user_followees.data')]
    candidates = list(set(all_ids) - set(exist_owners))
    queue = RedisQueue('answer_queue')
    queue.clear()
    print('Count: %d' % len(candidates))
    for c in candidates[0:]:
        queue.put(c)
#!/usr/bin/python
from RedisQueue import RedisQueue
import subprocess
import json
import base64

q = RedisQueue('messages', namespace='ansible',
               host='internal-redis.ovmdvp.0001.use2.cache.amazonaws.com',
               port=6379, db=1)

while True:
    res = q.get()
    message = json.loads(res)
    subprocess.Popen([
        "/home/ubuntu/ansible-bot/message_bridge/run_ansible_controller.sh",
        message['response_id'],
        message['playbook'],
        base64.b64encode(res)
    ])
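For context, whatever produces messages for this bridge only needs to supply the two keys the loop above reads, 'response_id' and 'playbook'; a hypothetical enqueue (values invented for illustration) might look like this:

# Hypothetical producer for the 'ansible:messages' queue consumed above;
# the message must be JSON containing the keys the consumer reads.
import json
from RedisQueue import RedisQueue

q = RedisQueue('messages', namespace='ansible',
               host='internal-redis.ovmdvp.0001.use2.cache.amazonaws.com',
               port=6379, db=1)
q.put(json.dumps({
    'response_id': 'example-response-id',  # invented for illustration
    'playbook': 'site.yml',                # invented for illustration
}))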
class TCWorker: """A class of tc worker""" def __init__(self): # tc spot type self.worker_type = Config.TC_Spot # DB self.tc_type = Config.TC_TYPE # queue type self.mysqlAccess = MysqlAccess() # mysql access self.redisQueue = RedisQueue() # redis queue self.mongofsAccess = MongofsAccess() # mongodb fs access # 抓取设置 self.crawler = TCCrawler() # message self.message = Message() # 抓取时间设定 self.crawling_time = Common.now() # 当前爬取时间 self.begin_time = Common.now() self.begin_date = Common.today_s() self.begin_hour = Common.nowhour_s() def init_crawl(self, _obj, _crawl_type): self._obj = _obj self._crawl_type = _crawl_type # dial client self.dial_client = DialClient() # local ip self._ip = Common.local_ip() # router tag self._router_tag = "ikuai" # self._router_tag = 'tpent' # items self.items = [] # giveup items self.giveup_items = [] # giveup msg val self.giveup_val = None self.init_log(_obj, _crawl_type) def init_log(self, _obj, _crawl_type): if not Logger.logger: loggername = "other" filename = "crawler_%s" % (time.strftime("%Y%m%d%H", time.localtime(self.begin_time))) if _obj == "channel": loggername = "channel" filename = "add_%s_%s" % (_crawl_type, time.strftime("%Y%m%d%H", time.localtime(self.begin_time))) # elif _obj == 'item': Logger.config_logging(loggername, filename) # To dial router def dialRouter(self, _type, _obj): try: _module = "%s_%s" % (_type, _obj) self.dial_client.send((_module, self._ip, self._router_tag)) except Exception as e: Common.log("# To dial router exception: %s" % e) # To crawl retry def crawlRetry(self, _key, msg): if not msg: return msg["retry"] += 1 _retry = msg["retry"] _obj = msg["obj"] max_time = Config.crawl_retry if _obj == "channel": max_time = Config.channel_crawl_retry elif _obj == "item": max_time = Config.item_crawl_retry if _retry < max_time: self.redisQueue.put_q(_key, msg) else: # self.push_back(self.giveup_items, msg) Common.log("# retry too many time, no get msg:") Common.log(msg) # To crawl page def crawlPage(self, _obj, _crawl_type, _key, msg, _val): try: if _obj == "channel": self.run_channel(msg) else: Common.log("# crawlPage unknown obj = %s" % _obj) except Common.InvalidPageException as e: Common.log("# Invalid page exception: %s" % e) self.crawlRetry(_key, msg) except Common.DenypageException as e: Common.log("# Deny page exception: %s" % e) self.crawlRetry(_key, msg) # 重新拨号 try: self.dialRouter(4, "chn") except Exception as e: Common.log("# DailClient Exception err: %s" % e) time.sleep(random.uniform(10, 30)) time.sleep(random.uniform(10, 30)) except Common.SystemBusyException as e: Common.log("# System busy exception: %s" % e) self.crawlRetry(_key, msg) time.sleep(random.uniform(10, 30)) except Common.RetryException as e: Common.log("# Retry exception: %s" % e) if self.giveup_val: msg["val"] = self.giveup_val self.crawlRetry(_key, msg) time.sleep(random.uniform(20, 30)) except Exception as e: Common.log("# exception err: %s" % e) self.crawlRetry(_key, msg) Common.traceback_log() if str(e).find("Read timed out") == -1: # 重新拨号 try: self.dialRouter(4, "chn") except Exception as e: Common.log("# DailClient Exception err: %s" % e) time.sleep(random.uniform(10, 30)) def run_channel(self, msg): msg_val = msg["val"] c = Channel() c.antPage(msg_val) # self.items = c.channel_items self.run_items(c) # 并行获取商品 def run_items(self, chan): Common.log("# Items start, channel_id:%s, channel_name:%s" % (str(chan.channel_id), chan.channel_name)) # 多线程 控制并发的线程数 Common.log("# Items num: %d" % len(chan.channel_items)) if len(chan.channel_items) > 
Config.item_max_th: m_itemsObj = TCItemM(self._crawl_type, Config.item_max_th) else: m_itemsObj = TCItemM(self._crawl_type, len(chan.channel_items)) m_itemsObj.createthread() m_itemsObj.putItems(chan.channel_items) m_itemsObj.run() item_list = m_itemsObj.items Common.log("# find Items num: %d" % len(chan.channel_items)) Common.log("# crawl Items num: %d" % len(item_list)) giveup_items = m_itemsObj.giveup_items if len(giveup_items) > 0: Common.log("# giveup Items num: %d" % len(giveup_items)) raise Common.RetryException("# run_items: some items retry more than max times..") Common.log("# Items end, channel_id:%s, channel_name:%s" % (str(chan.channel_id), chan.channel_name)) def process(self, _obj, _crawl_type, _val=None): # self.processMulti(_obj, _crawl_type, _val) self.processOne(_obj, _crawl_type, _val) def processOne(self, _obj, _crawl_type, _val=None): self.init_crawl(_obj, _crawl_type) i, M = 0, 20 if _obj == "channel": M = 2 n = 0 while True: if _crawl_type and _crawl_type != "": _key = "%s_%s_%s" % (self.tc_type, _obj, _crawl_type) else: _key = "%s_%s" % (self.tc_type, _obj) _msg = self.redisQueue.get_q(_key) # 队列为空 if not _msg: i += 1 if i > M: Common.log("# not get queue of key: %s" % _key) Common.log("# all get num of item in queue: %d" % n) break time.sleep(10) continue n += 1 try: self.crawlPage(_obj, _crawl_type, _key, _msg, _val) except Exception as e: Common.log("# exception err in process of TCWorker: %s , key: %s" % (e, _key)) Common.log(_msg) def processMulti(self, _obj, _crawl_type, _val=None): self.init_crawl(_obj, _crawl_type) if _crawl_type and _crawl_type != "": _key = "%s_%s_%s" % (self.tc_type, _obj, _crawl_type) else: _key = "%s_%s" % (self.tc_type, _obj) try: self.crawlPageMulti(_obj, _crawl_type, _key, _val) except Exception as e: Common.log("# exception err in processMulti of TCWorker: %s, key: %s" % (e, _key)) # To crawl page def crawlPageMulti(self, _obj, _crawl_type, _key, _val): self.run_multiitems(_key, _val) # Common.log('# crawlPageMulti unknown obj = %s' % _obj) def run_multiitems(self, _key, _val): mitem = TCItemRedisM(_key, self._crawl_type, 20, _val) mitem.createthread() mitem.run() item_list = mitem.items Common.log("# crawl Items num: %d" % len(item_list))
class JHSGroupItemWorker(): '''A class of JHS group item channel worker''' def __init__(self): # jhs group item type self.worker_type = Config.JHS_GroupItem self.jhs_type = Config.JHS_TYPE # queue type # message self.message = Message() # 获取Json数据 self.jsonpage = Jsonpage() # 抓取设置 self.crawler = TBCrawler() # 抓取时间设定 self.crawling_time = Common.now() # 当前爬取时间 self.begin_time = Common.now() self.begin_date = Common.today_s() self.begin_hour = Common.nowhour_s() # DB # mysql access self.mysqlAccess = MysqlAccess() # redis queue self.redisQueue = RedisQueue() # redis access self.redisAccess = RedisAccess() # mongodb fs access self.mongofsAccess = MongofsAccess() def init_crawl(self, _obj, _crawl_type): self._obj = _obj self._crawl_type = _crawl_type # dial client self.dial_client = DialClient() # local ip self._ip = Common.local_ip() # router tag self._router_tag = 'ikuai' #self._router_tag = 'tpent' # items self.items = [] # giveup items self.giveup_items = [] # giveup msg val self.giveup_val = None # To dial router def dialRouter(self, _type, _obj): try: _module = '%s_%s' %(_type, _obj) self.dial_client.send((_module, self._ip, self._router_tag)) except Exception as e: print '# To dial router exception :', e def push_back_list(self, L, v): L.extend(v) def push_back_val(self, L, v): L.append(v) # To crawl retry def crawlRetry(self, _key, msg): if not msg: return msg['retry'] += 1 _retry = msg['retry'] _obj = msg["obj"] max_time = Config.crawl_retry if _obj == 'groupitemcat': max_time = Config.json_crawl_retry elif _obj == 'groupitem': max_time = Config.item_crawl_retry if _retry < max_time: self.redisQueue.put_q(_key, msg) else: #self.push_back(self.giveup_items, msg) print "# retry too many time, no get:", msg def crawlPage(self, _key, msg, _val): try: if self._obj == 'groupitemcat': self.run_category(msg, _val) else: print '# crawlPage unknown obj = %s' % self._obj except Common.InvalidPageException as e: print '# Invalid page exception:',e self.crawlRetry(_key,msg) except Common.DenypageException as e: print '# Deny page exception:',e self.crawlRetry(_key,msg) # 重新拨号 try: self.dialRouter(4, 'chn') except Exception as e: print '# DailClient Exception err:', e time.sleep(random.uniform(10,30)) time.sleep(random.uniform(10,30)) except Common.SystemBusyException as e: print '# System busy exception:',e self.crawlRetry(_key,msg) time.sleep(random.uniform(10,30)) except Common.RetryException as e: print '# Retry exception:',e if self.giveup_val: msg['val'] = self.giveup_val self.crawlRetry(_key,msg) time.sleep(random.uniform(20,30)) except Exception as e: print '# exception err:',e self.crawlRetry(_key,msg) # 重新拨号 if str(e).find('Read timed out') == -1: try: self.dialRouter(4, 'chn') except Exception as e: print '# DailClient Exception err:', e time.sleep(random.uniform(10,30)) Common.traceback_log() def run_category(self, msg, _val): category_val = msg["val"] refers = _val c_url,c_name,c_id = category_val print c_url,c_name,c_id page = self.crawler.getData(c_url, refers) page_val = (page,c_name,c_id) ajax_url_list = self.getAjaxurlList(page_val,c_url) if len(ajax_url_list) > 0: self.get_jsonitems(ajax_url_list) # get json ajax url def getAjaxurlList(self, page_val, refers=''): url_list = [] page, c_name, c_id = page_val p = re.compile(r'<.+?data-ajaxurl="(.+?)".+?>(.+?)</div>',flags=re.S) i = 0 for a_info in p.finditer(page): c_subNav = c_name a_url = a_info.group(1).replace('amp;','') info = a_info.group(2) m = re.search(r'<span class="l-f-tbox">(.+?)</span>',info,flags=re.S) if m: c_subNav = 
m.group(1).strip() a_val = (c_id,c_name,refers,c_subNav) url_list.append((a_url,refers,a_val)) i += 1 return url_list # get item json list in category page from ajax url def get_jsonitems(self, ajax_url_list): # today all items val todayall_item_val = [] # other sub nav items val item_list = [] # process ajax url list item_json_index = 0 # mongo json pages cat_pages = {} for a_url in ajax_url_list: # get json from ajax url Result_list = self.jsonpage.get_json([a_url]) # mongo page json _url,_refers,_val = a_url _c_id = _val[0] time_s = time.strftime("%Y%m%d%H", time.localtime(self.crawling_time)) # timeStr_jhstype_webtype_itemgroupcat_catid key = '%s_%s_%s_%s_%s' % (time_s,Config.JHS_TYPE,'1','itemgroupcat',str(_c_id)) cat_pages[key] = '<!-- url=%s --> %s' % (_url,str(Result_list)) if Result_list and len(Result_list) > 0: item_result_valList = self.jsonpage.parser_itemjson(Result_list) if item_result_valList and len(item_result_valList) > 0: item_json_index += 1 # the first item list is all online items if item_json_index == 1: if len(item_result_valList) > 0: print '# all online items.....' todayall_item_val = item_result_valList else: self.push_back_list(item_list, item_result_valList) else: print '# not get itemjson parse val list...' if len(item_list) > 0: self.parseItems(item_list) # cat pages json for key in cat_pages.keys(): _pages = (key,cat_pages[key]) self.mongofsAccess.insertJHSPages(_pages) # 解析从接口中获取的商品数据 def parseItems(self, item_list): print '# parse Group Items start:',time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) # 附加信息 a_val = (self.begin_time,) # 多线程 控制并发的线程数 max_th = Config.item_max_th if len(item_list) > max_th: m_itemsObj = JHSGroupItemParserM(self._crawl_type, max_th, a_val) else: m_itemsObj = JHSGroupItemParserM(self._crawl_type, len(item_list), a_val) m_itemsObj.createthread() m_itemsObj.putItems(item_list) m_itemsObj.run() _items = m_itemsObj.items self.push_back_list(self.items,_items) print '# queue item num:',len(self.items) print '# parse item num:',len(_items) print '# parse Group Items end:',time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) def process(self, _obj, _crawl_type, _val=None): self.init_crawl(_obj, _crawl_type) if _obj == 'groupitem': self.processMulti(_val) else: self.processOne(_val) def processOne(self, _val=None): i, M = 0, 10 n = 0 while True: try: if self._crawl_type and self._crawl_type != '': _key = '%s_%s_%s' % (self.jhs_type, self._obj, self._crawl_type) else: _key = '%s_%s' % (self.jhs_type, self._obj) _msg = self.redisQueue.get_q(_key) # 队列为空 if not _msg: i += 1 if i > M: print '# all get catQ item num:',n print '# not get catQ of key:',_key break time.sleep(10) continue n += 1 self.crawlPage(_key, _msg, _val) except Exception as e: print '# exception err in process of JHSGroupItemWorker:',e,_key,_msg def processMulti(self, _val=None): if self._crawl_type and self._crawl_type != '': _key = '%s_%s_%s' % (self.jhs_type, self._obj, self._crawl_type) else: _key = '%s_%s' % (self.jhs_type, self._obj) try: self.crawlPageMulti(_key, _val) except Exception as e: print '# exception err in processMulti of JHSGroupItemWorker: %s, key: %s' % (e,_key) # To crawl page def crawlPageMulti(self, _key, _val): if self._obj == 'groupitem': self.run_groupitem(_key, _val) else: print '# crawlPageMulti unknown obj = %s' % self._obj def run_groupitem(self, _key, _val): m_itemQ = JHSGroupItemQM(self._obj, self._crawl_type, 20, _val) m_itemQ.createthread() m_itemQ.run() item_list = m_itemQ.items print '# crawl Items num: %d' % 
len(item_list) # 删除redis数据库过期商品 def delItem(self, _items): for _item in _items: keys = [self.worker_type, _item["item_juId"]] item = self.redisAccess.read_jhsitem(keys) if item: end_time = item["end_time"] now_time = Common.time_s(self.begin_time) # 删除过期的商品 if now_time > end_time: self.redisAccess.delete_jhsitem(keys) # 把商品信息存入redis数据库中 def putItemDB(self, _items): for _item in _items: # 忽略已经存在的商品ID keys = [self.worker_type, _item["item_juId"]] if self.redisAccess.exist_jhsitem(keys): continue # 将商品基础数据写入redis item_val = self.message.itemInfo(_item["r_val"]) val = self.message.itemMsg(item_val) self.redisAccess.write_jhsitem(keys, val) # 更新商品信息 def updateItem(self, _item): keys = [self.worker_type, _item["item_juId"]] item = self.redisAccess.read_jhsitem(keys) if item: item_val = self.message.itemParseInfo(_item["r_val"]) c = False if item["start_time"] != item_val["start_time"]: item["start_time"] = item_val["start_time"] c = True if item["end_time"] != item_val["end_time"]: item["end_time"] = item_val["end_time"] c = True if c: self.redisAccess.write_jhsitem(keys, item) # 查找新商品 def selectNewItems(self, _items): new_items = [] for _item in _items: keys = [self.worker_type, _item["item_juId"]] if self.redisAccess.exist_jhsitem(keys): self.updateItem(_item) continue new_items.append(_item["val"]) return new_items def scanEndItems(self): val = (Common.time_s(self.crawling_time),) _items = self.mysqlAccess.selectJhsGroupItemEnd(val) end_items = [] # 遍历商品 for _item in _items: item_juid = _item[0] end_items.append({"item_juId":str(item_juid)}) print '# del item nums:',len(end_items) # 删除已经结束的商品 self.delItem(end_items) def scanEndItemsLasthour(self): val = (Common.add_hours(self.crawling_time, -2),Common.add_hours(self.crawling_time, -2),Common.add_hours(self.crawling_time, -1)) _items = self.mysqlAccess.selectJhsGroupItemEndLastOneHour(val) end_items = [] # 遍历商品 for _item in _items: item_juid = _item[0] end_items.append({"item_juId":str(item_juid)}) print '# del item nums for last hour end:',len(end_items) # 删除已经结束的商品 self.delItem(end_items) def scanAliveItems(self): # 到结束时间后的一个小时 val = (Common.time_s(self.crawling_time), Common.add_hours(self.crawling_time, -1)) # 查找已经开团但是没有结束的商品 _items = self.mysqlAccess.selectJhsGroupItemAlive(val) print "# hour all item nums:",len(_items) return _items def scanNotEndItems(self): val = (Common.time_s(self.crawling_time),) # 查找没有结束的商品 _items = self.mysqlAccess.selectJhsGroupItemNotEnd(val) i = 1 for _item in _items: print i item_juid = str(_item[1]) keys = [self.worker_type, item_juid] item = self.redisAccess.read_jhsitem(keys) print item #_new_item = {"crawling_time":item["crawling_time"],"item_juid":item["item_juId"],"groupcat_id":item["item_groupCatId"],"groupcat_name":item["item_groupCatName"],"item_ju_url":item["item_ju_url"],"item_juname":item["item_juName"],"item_id":item["item_id"],"start_time":item["start_time"],"end_time":item["end_time"]} #self.redisAccess.write_jhsitem(keys, _new_item) i += 1 def scanCategories(self): category_list = self.mysqlAccess.selectJhsGroupItemCategory() return category_list
import urllib2
from RedisQueue import RedisQueue

redis = RedisQueue('jandan3')

def user_agent(url):
    req_header = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0'}
    req_timeout = 20
    req = urllib2.Request(url, None, req_header)
    page = urllib2.urlopen(req, None, req_timeout)
    html = page
    return html

while not redis.empty():
    down_url = redis.get()
    data = user_agent(down_url).read()
    with open('D:/Python/picture' + '/' + down_url[-11:], 'wb') as code:
        code.write(data)
    print down_url
class JHSWorker(): '''A class of jhs worker''' def __init__(self): # jhs brand type self.worker_type = Config.JHS_Brand # DB self.jhs_type = Config.JHS_TYPE # queue type self.mysqlAccess = MysqlAccess() # mysql access self.redisQueue = RedisQueue() # redis queue self.redisAccess = RedisAccess() # redis db self.mongofsAccess = MongofsAccess() # mongodb fs access # 获取Json数据 self.jsonpage = Jsonpage() # 抓取设置 self.crawler = TBCrawler() # 页面模板解析 self.brand_temp = JHSBrandTEMP() # message self.message = Message() # 抓取时间设定 self.crawling_time = Common.now() # 当前爬取时间 self.begin_time = Common.now() self.begin_date = Common.today_s() self.begin_hour = Common.nowhour_s() def init_crawl(self, _obj, _crawl_type): self._obj = _obj self._crawl_type = _crawl_type # dial client self.dial_client = DialClient() # local ip self._ip = Common.local_ip() # router tag self._router_tag = 'ikuai' #self._router_tag = 'tpent' # items self.items = [] # giveup items self.giveup_items = [] # giveup msg val self.giveup_val = None # To dial router def dialRouter(self, _type, _obj): try: _module = '%s_%s' %(_type, _obj) self.dial_client.send((_module, self._ip, self._router_tag)) except Exception as e: print '# To dial router exception :', e # To crawl retry def crawlRetry(self, _key, msg): if not msg: return msg['retry'] += 1 _retry = msg['retry'] _obj = msg["obj"] max_time = Config.crawl_retry if _obj == 'cat': max_time = Config.json_crawl_retry elif _obj == 'act': max_time = Config.act_crawl_retry elif _obj == 'item': max_time = Config.item_crawl_retry if _retry < max_time: self.redisQueue.put_q(_key, msg) else: #self.push_back(self.giveup_items, msg) print "# retry too many time, no get:", msg # To crawl page def crawlPage(self, _obj, _crawl_type, _key, msg, _val): try: if _obj == 'cat': if _crawl_type == 'home' or _crawl_type == 'homeposition': self.run_cat_home(msg, _val) else: self.run_cat(msg, _val) elif _obj == 'act': self.run_act(msg) elif _obj == 'item': self.run_item(msg, _val) else: print '# crawlPage unknown obj = %s' % _obj except Common.InvalidPageException as e: print '# Invalid page exception:',e self.crawlRetry(_key,msg) except Common.DenypageException as e: print '# Deny page exception:',e self.crawlRetry(_key,msg) # 重新拨号 try: self.dialRouter(4, 'chn') except Exception as e: print '# DailClient Exception err:', e time.sleep(random.uniform(10,30)) time.sleep(random.uniform(10,30)) except Common.SystemBusyException as e: print '# System busy exception:',e self.crawlRetry(_key,msg) time.sleep(random.uniform(10,30)) except Common.RetryException as e: print '# Retry exception:',e if self.giveup_val: msg['val'] = self.giveup_val self.crawlRetry(_key,msg) time.sleep(random.uniform(20,30)) except Exception as e: print '# exception err:',e self.crawlRetry(_key,msg) # 重新拨号 if str(e).find('Read timed out') == -1: try: self.dialRouter(4, 'chn') except Exception as e: print '# DailClient Exception err:', e time.sleep(random.uniform(10,30)) Common.traceback_log() def run_cat_home(self, msg, _val): msg_val = msg["val"] _url, refers = msg_val print '# brand home:',_url page = self.crawler.getData(_url, refers) # save to mongo # timeStr_jhstype_webtype_obj_crawltype time_s = time.strftime("%Y%m%d%H", time.localtime(self.crawling_time)) key = '%s_%s_%s_%s_%s' % (time_s,Config.JHS_TYPE,'1',self._obj,self._crawl_type) p_content = '<!-- url=%s --> %s' % (_url,page) self.mongofsAccess.insertJHSPages((key,p_content)) c_url_val_list = self.brand_temp.temp(page) for c_url_val in c_url_val_list: c_url, c_name, c_id = c_url_val 
self.items.append((Common.fix_url(c_url),c_id,c_name,Config.ju_brand_home,Config.JHS_Brand)) if self._crawl_type == 'homeposition': top_acts = self.brand_temp.activityTopbrandTemp(page) print top_acts self.save_top_acts(top_acts) def save_top_acts(self, top_acts): if top_acts: for key in top_acts.keys(): act = top_acts[key] c_time, act_id, act_name, act_position, act_url, f_id, f_name, sub_nav = Common.now(), -1, '', -1, '', 0, '', '' c_date, c_hour = time.strftime("%Y-%m-%d", time.localtime(c_time)), time.strftime("%H", time.localtime(c_time)) if act.has_key('act_id'): act_id = act["act_id"] if act.has_key('position'): act_position = act["position"] if act.has_key('url'): act_url = act["url"] if act.has_key('datatype'): f_name = act["datatype"] val = (Common.time_s(c_time),act_id,act_name,act_url,Config.JHS_Brand,sub_nav,act_position,f_id,f_name,'',c_date,c_hour) self.mysqlAccess.insertJhsActPosition_hour(val) def run_cat(self, msg, _val): msg_val = msg["val"] c_url, c_id, c_name, refers, pagetype = msg_val print '# category',c_name,c_id if pagetype == Config.JHS_Brand: a_val = (c_id, c_name) self.get_actjson(c_url, refers, a_val, _val, pagetype) elif pagetype == Config.JHS_GroupItem: self.get_cat_jsons(c_url, c_id, c_name, refers, _val, pagetype) else: print '# not get category pagetype...' def get_cat_jsons(self, c_url, c_id, c_name, refers, _val, pagetype): a_val = (c_id, c_name) page = self.crawler.getData(c_url, refers) page_val = (page,c_id,c_name) ajax_url_list = self.getAjaxurlList(page_val) if len(ajax_url_list) > 0: # process ajax url list for url_val in ajax_url_list: c_url,c_subNav = url_val self.get_actjson(c_url, refers, a_val, _val, pagetype, c_subNav) def get_actjson(self, c_url, refers, a_val, _val, pagetype, c_subNav=''): if self._crawl_type == 'position': _val = (pagetype,c_subNav) + _val Result_list = self.jsonpage.get_jsonPage(c_url,refers,a_val) if Result_list and len(Result_list) > 0: # parser act result act_valList = self.jsonpage.parser_brandjson(Result_list,_val) if act_valList and len(act_valList) > 0: print '# get brand act num:',len(act_valList) self.items.extend(act_valList) else: print '# not get brandjson parse val list...' 
# get json ajax url def getAjaxurlList(self, page_val): url_list = [] page, c_id, c_name = page_val p = re.compile(r'<.+?id="(.+?)".+?data-ajaxurl="(.+?)".+?>(.+?)</div>',flags=re.S) i = 0 for a_info in p.finditer(page): c_subNav = '' f_id = a_info.group(1) a_url = a_info.group(2).replace('amp;','') info = a_info.group(3) m = re.search(r'<span class="l-f-tbox">(.+?)</span>', info, flags=re.S) if m: c_subNav = m.group(1).strip() if c_subNav == '': m = re.search(r'<td.+?data-target="%s".+?>(.+?)</td>' % f_id, page, flags=re.S) if m: c_subNav = re.sub(r'<.+?>','',m.group(1)) #url_list.append((a_url,refers,a_val)) url_list.append((a_url,c_subNav)) i += 1 return url_list # ACT queue def run_act(self, msg): # 默认数据 msg_val = msg["val"] print '# act start:',time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) act_obj = None if self._crawl_type == 'main': act_obj = JHSAct() act_obj.antPageMain(msg_val) elif self._crawl_type == 'check': act_obj = JHSAct() act_obj.antPageCheck(msg_val) elif self._crawl_type == 'position': act_obj = JHSAct() act_obj.antPageParser(msg_val) print '# act end:',time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) if self._crawl_type == 'position': brandact_id,brandact_name,brandact_url,brandact_sign,brandact_status,val = act_obj.outTupleForPosition() if int(brandact_sign) != 3: if act_obj.brandact_starttime and act_obj.brandact_starttime != 0.0 and 1 >= Common.subTS_hours(int(float(act_obj.brandact_starttime)/1000), self.crawling_time): print '# insert activity position, id:%s name:%s'%(act_obj.brandact_id, act_obj.brandact_name) self.mysqlAccess.insertJhsActPosition_hour(val) elif brandact_status != '' and brandact_status != 'blank': print '# insert activity position, id:%s name:%s'%(act_obj.brandact_id, act_obj.brandact_name) self.mysqlAccess.insertJhsActPosition_hour(val) else: act_keys = [self.worker_type, str(act_obj.brandact_id)] prev_act = self.redisAccess.read_jhsact(act_keys) # 是否需要抓取商品 if act_obj and act_obj.crawling_confirm != 2: # 保存的活动信息 self.putActDB(act_obj, prev_act) # 活动中的商品 items_list = [] # 只取非俪人购商品 if int(act_obj.brandact_sign) != 3: if act_obj.crawling_confirm == 0: #更新马上开团活动中商品位置 self.update_actItems_position(act_obj) # 多线程抓商品 items_list = self.run_actItems(act_obj, prev_act) else: print '# ladygo activity id:%s name:%s'%(act_obj.brandact_id, act_obj.brandact_name) #print '# pro act start:',time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) # 处理活动信息 #self.procAct(act_obj, prev_act, items_list) # 处理活动redis信息 self.procActRedis(act_obj, prev_act, items_list) #print '# pro act end:',time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) else: self.update_startact(act_obj, prev_act) print '# Already start activity, id:%s name:%s'%(act_obj.brandact_id, act_obj.brandact_name) # 更新开团后活动 def update_startact(self, act, prev_act): if act.brandact_endtime and act.brandact_endtime != 0.0: end_time_s = Common.time_s(float(act.brandact_endtime)/1000) if prev_act and end_time_s != prev_act['end_time']: prev_act['end_time'] = end_time_s # redis keys = [self.worker_type, str(act.brandact_id)] self.redisAccess.write_jhsact(keys, prev_act) self.mysqlAccess.updateJhsActEndtime((end_time_s,str(act.brandact_id))) #更新马上开团活动中商品位置 def update_actItems_position(self, act): update_val_list = [] act_id = act.brandact_id for item in act.brandact_itemVal_list: if str(item[7]) != '': update_val_list.append((str(item[7]),str(act_id),item[4])) self.mysqlAccess.updateJhsItemPosition(update_val_list) # 并行获取品牌团商品 def run_actItems(self, act, 
prev_act): print '# act items start:',time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) # 需要抓取的item item_val_list = [] # 过滤已经抓取过的商品ID列表 item_ids = act.brandact_itemids if prev_act: prev_item_ids = prev_act["item_ids"] item_ids = Common.diffSet(item_ids, prev_item_ids) # 如果已经抓取过的活动没有新上线商品,则退出 if len(item_ids) == 0: print '# Activity no new Items' print '# Activity Items end:',time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())), act.brandact_id, act.brandact_name return None for item in act.brandact_itemVal_list: if str(item[6]) in item_ids or str(item[7]) in item_ids: item_val_list.append(item) else: item_val_list = act.brandact_itemVal_list # 如果活动没有商品, 则退出 if len(item_ids) == 0: print '# run_brandItems: no items in activity, act_id=%s, act_name=%s' % (act.brandact_id,act.brandact_name) return None print '# Activity Items crawler start:',time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())), act.brandact_id, act.brandact_name # 多线程 控制并发的线程数 if len(item_val_list) > Config.item_max_th: m_itemsObj = JHSItemM('main', Config.item_max_th) else: m_itemsObj = JHSItemM('main', len(item_val_list)) m_itemsObj.createthread() m_itemsObj.putItems(item_val_list) m_itemsObj.run() item_list = m_itemsObj.items print '# Activity find new Items num:', len(item_val_list) print '# Activity crawl Items num:', len(item_list) giveup_items = m_itemsObj.giveup_items if len(giveup_items) > 0: print '# Activity giveup Items num:',len(giveup_items) raise Common.RetryException('# run_actItems: actid:%s actname:%s some items retry more than max times..'%(str(act.brandact_id),str(act.brandact_name))) print '# Activity Items end:',time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())), act.brandact_id, act.brandact_name return item_list # To merge activity def mergeAct(self, act, prev_act): if prev_act: # 合并本次和上次抓取的商品ID列表 prev_item_ids = prev_act["item_ids"] act.brandact_itemids = Common.unionSet(act.brandact_itemids, prev_item_ids) # 取第一次的活动抓取时间 act.crawling_time = Common.str2timestamp(prev_act["crawl_time"]) if not act.brandact_name or act.brandact_name == '': act.brandact_name = prev_act["act_name"] if not act.brandact_url or act.brandact_url == '': act.brandact_url = prev_act["act_url"] if not act.brandact_position or str(act.brandact_position) == '0': act.brandact_position = prev_act["act_position"] if not act.brandact_enterpic_url or act.brandact_enterpic_url == '': act.brandact_enterpic_url = prev_act["act_enterpic_url"] if not act.brandact_remindNum or str(act.brandact_remindNum) == '0': act.brandact_remindNum = prev_act["act_remindnum"] if not act.brandact_coupons or act.brandact_coupons == []: act.brandact_coupon = prev_act["act_coupon"] act.brandact_coupons = prev_act["act_coupons"].split(Config.sep) if not act.brandact_starttime or act.brandact_starttime == 0.0: act.brandact_starttime = Common.str2timestamp(prev_act["start_time"]) if not act.brandact_endtime or act.brandact_endtime == 0.0: act.brandact_endtime = Common.str2timestamp(prev_act["end_time"]) if not act.brandact_other_ids or act.brandact_other_ids == '': act.brandact_other_ids = prev_act["_act_ids"] # To put act db def putActDB(self, act, prev_act): # 预热信息 if self._crawl_type == 'main': self.mysqlAccess.insertJhsActComing(act.outSql()) # redis self.mergeAct(act, prev_act) if self._crawl_type == 'main': # mysql if prev_act: print '# update activity, id:%s name:%s'%(act.brandact_id, act.brandact_name) self.mysqlAccess.updateJhsAct(act.outSqlForUpdate()) else: print '# insert activity, id:%s 
name:%s'%(act.brandact_id, act.brandact_name) self.mysqlAccess.insertJhsAct(act.outSql()) # mongo # 存网页 _pages = act.outItemPage(self._crawl_type) self.mongofsAccess.insertJHSPages(_pages) # To process activity in redis def procActRedis(self, act, prev_act, items_list): # 活动抓取的item ids act.brandact_itemids = [] if items_list: for item in items_list: # item juid if str(item[1]) != '': act.brandact_itemids.append(str(item[1])) # item id if str(item[10]) != '': act.brandact_itemids.append(str(item[10])) # redis self.mergeAct(act, prev_act) keys = [self.worker_type, str(act.brandact_id)] val = act.outTupleForRedis() self.redisAccess.write_jhsact(keys, val) # To process activity def procAct(self, act, prev_act, items_list): # 活动抓取的item ids act.brandact_itemids = [] if items_list: for item in items_list: # item juid if str(item[1]) != '': act.brandact_itemids.append(str(item[1])) # item id if str(item[10]) != '': act.brandact_itemids.append(str(item[10])) # 将抓取的活动信息存入redis self.putActDB(act, prev_act) # ITEM queue def run_item(self, msg, _val): # 默认数据 msg_val = msg["val"] brandact_id, brandact_name, item_val_list = msg_val print '# Activity Items start:',time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())), brandact_id, brandact_name # 多线程 控制并发的线程数 max_th = Config.item_max_th if len(item_val_list) > max_th: m_itemsObj = JHSItemM(self._crawl_type, max_th, _val) else: m_itemsObj = JHSItemM(self._crawl_type, len(item_val_list), _val) m_itemsObj.createthread() m_itemsObj.putItems(item_val_list) m_itemsObj.run() item_list = m_itemsObj.items print '# Activity Items num:', len(item_val_list) print '# Activity crawl Items num:', len(item_list) giveup_items = m_itemsObj.giveup_items if len(giveup_items) > 0: print '# Activity giveup Items num:',len(giveup_items) self.giveup_val = (brandact_id, brandact_name, giveup_items) raise Common.RetryException('# run_item: actid:%s actname:%s some items retry more than max times..'%(str(brandact_id),str(brandact_name))) print '# Activity Items end:',time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())), brandact_id, brandact_name def process(self, _obj, _crawl_type, _val=None): self.init_crawl(_obj, _crawl_type) i, M = 0, 20 if _obj == 'cat': M = 10 n = 0 while True: if _crawl_type and _crawl_type != '': _key = '%s_%s_%s' % (self.jhs_type,_obj,_crawl_type) else: _key = '%s_%s' % (self.jhs_type,_obj) _msg = self.redisQueue.get_q(_key) # 队列为空 if not _msg: i += 1 if i > M: print '# not get queue of key:',_key,time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) print '# all get num of item in queue:',n break time.sleep(10) continue n += 1 try: self.crawlPage(_obj, _crawl_type, _key, _msg, _val) except Exception as e: print '# exception err in process of JHSWorker:',e,_key,_msg # 删除redis数据库过期活动 def delAct(self, _acts): i = 0 for _act in _acts: keys = [self.worker_type, str(_act[0])] item = self.redisAccess.read_jhsact(keys) if item: end_time = item["end_time"] now_time = Common.time_s(self.crawling_time) # 删除过期的活动 if now_time > end_time: i += 1 self.redisAccess.delete_jhsact(keys) print '# delete acts num:',i def delItem(self, _items): i = 0 for _item in _items: keys = [self.worker_type, str(_item[0])] item = self.redisAccess.read_jhsitem(keys) if item: end_time = item["end_time"] now_time = Common.time_s(self.crawling_time) # 删除过期的商品 if now_time > end_time: i += 1 self.redisAccess.delete_jhsitem(keys) print '# delete items num:',i # 查找结束的活动 def scanEndActs(self, val): _acts = self.mysqlAccess.selectJhsActEnd(val) print '# end acts 
num:',len(_acts) # 删除已经结束的活动 self.delAct(_acts) # 查找结束的商品 def scanEndItems(self, val): _items = self.mysqlAccess.selectJhsItemEnd(val) print '# end items num:',len(_items) # 删除已经结束的商品 self.delItem(_items) # acts redis def actsRedis(self): _acts = self.mysqlAccess.selectActsRedisdata() print '# acts num:',len(_acts) i = 0 for _act in _acts: act_id = _act[2] #_itemids = self.mysqlAccess.selectItemsids(str(act_id)) #item_ids = [] #for _itemid in _itemids: # item_ids.append(str(_itemid[0])) # item_ids.append(str(_itemid[1])) #act_val = _act + (item_ids,) #print act_val #keys = [self.worker_type, str(act_id)] #print keys #if self.redisAccess.exist_jhsact(keys): #act_redis = self.redisAccess.read_jhsact(keys) #if len(act_redis) != 15: # print act_redis # i += 1 #print self.redisAccess.read_jhsact(keys) #self.redisAccess.delete_jhsact(keys) #self.redisAccess.write_jhsact(keys, act_val) #i += 1 #break print '# redis acts num:',i # items redis def itemsRedis(self): _items = self.mysqlAccess.selectItemRedisdata() print '# items num:', len(_items) i = 0 #for _item in _items: #msg = self.message.jhsitemMsg(_item) #print msg #keys = [self.worker_type, str(_item[0])] #print keys #if self.redisAccess.exist_jhsitem(keys): #print self.redisAccess.read_jhsitem(keys) #self.redisAccess.delete_jhsitem(keys) #self.redisAccess.write_jhsitem(keys, msg) #i += 1 #break print '# redis items num:',i
#!/usr/bin/python
#coding=utf-8
import urllib3
import base64
import time
import threading
from queue_config import master_host, author_login
from RedisQueue import RedisQueue

redis_conn = {'host': master_host[0], 'port': master_host[1]}
q = RedisQueue('account_login', **redis_conn)
http = urllib3.PoolManager(num_pools=50)

def worker(value):
    params = {}
    params['account_login'] = base64.encodestring(value)
    r = http.request('POST', author_login, params)
    # server failure: push the value back onto the queue
    if r.status != 200:
        q.put(value)
    # IP whitelist check failed: push the value back onto the queue
    if r.data['status'] == 10002:
        q.put(value)
    print r.data

while 1:
    # time.sleep(1);