Example #1
import csv

def init_task(mq, link):
    # `logger`, `etask` and `paser_page` are defined elsewhere in the module.
    lines = csv.reader(open(link, 'rb'))
    nd = {}
    cd = {}
    for line in lines:
        url = line[-1]
        # Skip URLs that have already been queued.
        if url in cd:
            continue
        cd[url] = 0
        # Disambiguate duplicate file names with a numeric suffix.
        filename = 'japan' + '/' + line[2]
        if filename not in nd:
            nd[filename] = 0
        else:
            nd[filename] += 1
            filename = filename + '_' + str(nd[filename])
        task = etask([url, filename, 0]).get_task()
        mq._cqs.put(task)

    while True:
        try:
            task = mq._cqs.get(True, 10)
        except Exception:
            logger.info('...get cqs task timed out...')
        else:
            logger.info('...run a task...')
            # Dispatch the task to the worker queue and keep the async result.
            result = paser_page.apply_async(args=[task],
                                            queue='machine1',
                                            routing_key='machine1')
            try:
                mq._cqx.put(result)
            except Exception:
                logger.error('...put cqx result error...')
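The snippet above assumes a task wrapper `etask` and a queue object `mq` exposing `_cqs` (pending crawl tasks) and `_cqx` (async results), neither of which is shown in the source. A minimal sketch of what those helpers might look like follows; the class names and the exact task-dict fields are assumptions for illustration, not the original implementation.

from Queue import Queue

class etask(object):
    """Hypothetical task wrapper: wraps [url, filename, step] into a task dict."""
    def __init__(self, info):
        self.info = info

    def get_task(self):
        # Assumed shape; paser_page below also expects 'spi_act'/'exp_act'
        # lists, which would be filled in by the surrounding module.
        return {'info': self.info}

class MQ(object):
    """Hypothetical queue holder with a crawl queue and a result queue."""
    def __init__(self):
        self._cqs = Queue()   # pending crawl tasks
        self._cqx = Queue()   # AsyncResult objects returned by apply_async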
Example #2
def getline(url):
    mc = MC()
    p = get_proxy('XXXX')
    if not p:
        logger.error('get proxy error ... %s', str(p))
    else:
        mc.set_proxy(p)
    page = mc.req('get', url, html_flag=True)
    return page
    def product_pooltask(self, poolres):
        pooltask = poolres
        spi_lst = pooltask['spi_act']
        exp_lst = pooltask['exp_act']
        oldinfo = pooltask['oldinfo']
        newinfo = pooltask['newinfo']

        file_new = []
        for f in newinfo:
            # Disambiguate duplicate (parent name + name) pairs with a suffix.
            on = oldinfo[1]
            n = f[1]
            if on + n in self.__name_dict:
                self.__name_dict[on + n] += 1
            else:
                self.__name_dict[on + n] = 0
            if self.__name_dict[on + n] != 0:
                n = n + '_' + str(self.__name_dict[on + n])

            # Resolve the new URL against the parent URL.
            newurl = f[0]
            oldurl = oldinfo[0]
            nus = newurl.split('/')
            if 'http' in newurl:
                # Already an absolute URL.
                pass
            elif nus[0] == '..':
                # Strip one '..' segment and one parent path level at a time.
                for fl in nus:
                    if fl == '..':
                        nus = nus[1:]
                        oldurl = '/'.join(oldurl.split('/')[:-2]) + '/'
                    else:
                        break
                newurl = oldurl + '/'.join(nus)
            else:
                oldurl = '/'.join(oldurl.split('/')[:-1]) + '/'
                if nus[0] == '.':
                    newurl = oldurl + '/'.join(nus[1:])
                else:
                    newurl = oldurl + '/'.join(nus)
            if oldinfo == []:
                file_new.append([newurl, n, f[2]])
            else:
                file_new.append([newurl, oldinfo[1] + '__' + n, f[2]])

        for info in file_new:
            if info[2] == -1:
                continue
            newtask = {}
            newtask['info'] = info
            if type(info[0]) == type(()):
                # Fill the 'miaoji@i' placeholders in the post data with the tuple values.
                for i in range(len(info[0])):
                    spi_lst[info[2]]['post_data'] = \
                        spi_lst[info[2]]['post_data'].replace('miaoji@' + str(i), info[0][i])
            newtask['spi_act'] = spi_lst
            newtask['exp_act'] = exp_lst
            try:
                self._cqs.put(newtask)
            except Exception as e:
                logger.error('(file:%s)requestqueue...%s', __name__, traceback.format_exc(e))
    def load_by_mc(self, spi, url, name):
        refer = spi['refer']
        mode = spi['mode']
        spi_url = spi['spi_url']
        post_type = spi['post_type']
        post_data = spi['post_data']
        post_url = spi['post_url']
        isproxy = spi['isproxy']
        debug = spi['debug']

        # Reuse one MC instance per task family (prefix of the file name).
        key = name.split('__')[0]
        if key in self.objmc:
            mc = self.objmc[key]
        else:
            mc = MC()
            mc.set_debug(debug)
            self.objmc[key] = mc

        p = ''
        if isproxy != '':
            p = get_proxy(source='citytraffic')
            if not p:
                logger.error('get proxy error ... %s', str(p))
            else:
                logger.info('this proxy is ... %s', str(p))
                mc.set_proxy(p)
        if refer != '':
            mc.add_referer(refer)
        if spi_url != '':
            # Warm-up request (e.g. to obtain cookies) before the real one.
            mc.req(mode, spi_url)

        page = ''
        try:
            if mode == 'post':
                page = mc.req(mode,
                              post_url,
                              paras=post_data,
                              paras_type=post_type,
                              html_flag=True)
            else:
                page = mc.req(mode, url, html_flag=True)
        except Exception as e:
            logger.error('load by mc ...<-!error::%s!-> <-!proxy::%s!->',
                         traceback.format_exc(e), str(p))
        # The caller (paser_page) unpacks the page together with the proxy used.
        return page, str(p)
    def thread_handle(self):
        logger.info('...thread_handle start...')
        while True:
            try:
                res = self._cqx.get(True, self.__resq_timeout)
            except Exception:
                # The result queue is starting to empty.
                logger.info('result queue cqx is empty')
            else:
                try:
                    ged = res.get(timeout=1)
                except Exception:
                    # The async result is not ready yet: put it back for later.
                    try:
                        self._cqx.put(res)
                    except Exception:
                        logger.error('time out task put cqx again error ...')
                else:
                    self.res_handle(ged)
    def req(self,
            method,
            url_base,
            paras={},
            paras_type=1,
            html_flag=False,
            time_out=60):
        # For GET the encoded parameters are appended directly to url_base
        # (the caller is expected to include the '?' / '&' separator);
        # for POST they are sent in the request body.
        if method.lower() == 'get':
            url = url_base + urllib.urlencode(paras)
        elif method.lower() == 'post':
            url = url_base
        else:
            logger.error('req, wrong method (post or get)')
            sys.exit(-1)

        html = ''
        try:
            request = urllib2.Request(url)
            request.add_header(
                'User-agent',
                'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:29.0) Gecko/20100101 Firefox/29.0')
            # One-shot headers, proxy and referer previously set on the instance.
            if self.header != {}:
                for key, value in self.header.items():
                    request.add_header(key, value)
            self.header = {}
            if self.p != '':
                request.set_proxy(self.p, 'http')
                request.set_proxy(self.p, 'https')
            if self.refer != {}:
                for key, value in self.refer.items():
                    request.add_header(key, value)
            self.refer = {}

            # Install the opener so that later urlopen() calls also use it.
            opener = urllib2.build_opener(
                urllib2.HTTPCookieProcessor(self.cookie), self.httpHandler)
            urllib2.install_opener(opener)
            if method.lower() == 'get':
                resp = urllib2.urlopen(request, timeout=time_out)
            else:
                if paras_type == 1:
                    paras = json.dumps(paras)
                elif paras_type == 0:
                    paras = urllib.urlencode(paras)
                elif paras_type == 2:
                    # Already a pre-encoded string.
                    pass
                else:
                    logger.error('req, wrong paras type (0, 1 or 2)')
                resp = opener.open(request, paras)
            if html_flag:
                html = resp.read()

        except Exception as e:
            if self.debug:
                logger.error(traceback.format_exc(e))
        return html
    def load_by_request(self, spi, url, name):
        refer = spi['refer']
        mode = spi['mode']
        spi_url = spi['spi_url']
        post_type = spi['post_type']
        post_data = spi['post_data']
        post_url = spi['post_url']
        isproxy = spi['isproxy']
        debug = spi['debug']

        p = get_proxy('XXXX')
        if not p:
            logger.error('get proxy error ... %s', str(p))

        # Reuse one urllibCrawler instance per task family.
        key = name.split('__')[0]
        if key in self.objreq:
            req = self.objreq[key]
        else:
            req = urllibCrawler()
            self.objreq[key] = req

        req.set_debug(debug)
        if isproxy != '' and p:
            req.set_proxy(p)
        if refer != '':
            req.add_referer(refer)
        if spi_url != '':
            # Warm-up request (e.g. to obtain cookies) before the real one.
            req.req(mode, spi_url)

        page = ''
        try:
            if mode == 'post':
                page = req.req(mode,
                               post_url,
                               paras=post_data,
                               paras_type=post_type,
                               html_flag=True)
            else:
                page = req.req(mode, url, html_flag=True)
        except Exception as e:
            logger.error('loading by urllib :: %s', traceback.format_exc(e))
        # The caller (paser_page) unpacks the page together with the proxy used.
        return page, str(p)
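A hedged usage sketch for the `req` method above: for GET the encoded parameters are appended directly to `url_base`, so the base URL should already carry the `?` (or `&`) separator; for POST, `paras_type` selects JSON (1), form encoding (0) or a pre-encoded string (2). The crawler class name comes from the snippet above; the URLs and parameters are placeholders for illustration.

crawler = urllibCrawler()   # assumed to be the class that defines req()
# GET: note the trailing '?' so the encoded parameters attach correctly.
html = crawler.req('get', 'http://example.com/search?',
                   paras={'q': 'tokyo'}, html_flag=True)
# POST with a JSON-encoded body (paras_type=1).
html = crawler.req('post', 'http://example.com/api',
                   paras={'page': 1}, paras_type=1, html_flag=True)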
Example #8
def paser_page(kwds):
    # Per-page result slots; the expressions in exp_act assign into these
    # exact names via exec below (which is why `type` shadows the builtin).
    realtime = []
    spacetime = []
    lname = []
    sname = []
    type = []
    url_res = []
    coordinates = []
    dl = download()
    task = kwds
    key_l = {
        'lname': [],
        'sname': [],
        'type': [],
        'coordinates': [],
        'realtime': [],
        'spacetime': [],
        'url_res': [],
    }

    task_url = task['info'][0]
    path = task['info'][1]
    filename = path.split('/')[0] + '/' + md5(path.split('/')[1])
    city_name = path.split('/')[0]
    step = task['info'][2]
    exp_act = task['exp_act'][step]
    spi_act = task['spi_act'][step]
    temp = dl.temp
    count = 0
    flag = True
    p = '0.0.0.0:0'
    for key, value in exp_act.items():
        if count == 0:
            # First expression: download the page (with retries) and evaluate it.
            i = 0
            while key_l[key] == []:
                i += 1
                if i > MAX:
                    logger.info('a task fail ::%s', str(task['info']))
                    flag = False
                    break
                try:
                    if os.path.exists(filename) and Islocal:
                        with open(filename, 'r') as fp:
                            page = fp.read()
                    else:
                        if spi_act['way'].lower() == 'req':
                            page, p = dl.load_by_request(spi_act, task_url, filename)
                        elif spi_act['way'].lower() == 'mc':
                            page, p = dl.load_by_mc(spi_act, task_url, filename)
                    exec(key + '=' + value)
                except Exception as e:
                    logger.error('spider error :: <-!error::%s!-> <-!task::%s!-> <-!proxy::%s!->',
                                 traceback.format_exc(e), str(task['info']), str(p))
                else:
                    logger.info('task success :: <-!task::%s!-> <-!proxy::%s!->',
                                str(task['info']), str(p))

                if key == 'lname':
                    key_l[key] = lname
                elif key == 'sname':
                    key_l[key] = sname
                elif key == 'type':
                    key_l[key] = type
                elif key == 'spacetime':
                    key_l[key] = spacetime
                elif key == 'realtime':
                    key_l[key] = realtime
                elif key == 'coordinates':
                    key_l[key] = coordinates
                elif key == 'url_res':
                    key_l[key] = url_res
            count += 1
        else:
            # Later expressions reuse the page downloaded for the first one.
            if i >= MAX:
                break
            try:
                exec(key + '=' + value)
            except Exception as e:
                logger.error('later expression error :: <-!error::%s!-> <-!task::%s!->',
                             traceback.format_exc(e), str(task['info']))
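paser_page drives each crawl step from two per-step dicts: spi_act tells the downloader how to fetch the page, and exp_act maps result names (lname, url_res, ...) to Python expressions that are exec'd against the downloaded `page`. A hypothetical single-step configuration is sketched below; the field values and the regex-based extraction expressions are assumptions for illustration (the real expressions are not shown in the source, and `re` would need to be importable in paser_page's module).

spi_act = [{
    'way': 'req', 'mode': 'get', 'refer': '', 'spi_url': '',
    'post_type': 1, 'post_data': '', 'post_url': '',
    'isproxy': '', 'debug': 0,
}]
exp_act = [{
    'lname':   "re.findall(r'<h1>(.*?)</h1>', page)",
    'url_res': "re.findall(r'href=\"(.*?)\"', page)",
}]
task = {'info': ['http://example.com/station', 'japan/station_1', 0],
        'spi_act': spi_act, 'exp_act': exp_act}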