def Get_result_Get(self):
    """Remove the last stored GET result and return a deep copy of it.

    The entry is taken off the end of ``self.result_get``; no emptiness
    check is made before popping.
    """
    newest = self.result_get.pop()
    # Copy before handing out so the caller owns an independent object.
    snapshot = copy.deepcopy(newest)
    INFO("[Schedule] [result] get result !!")
    return snapshot
def start_parser():
    """Load the ``system`` section of ``config.ini`` into module globals.

    Reads the options ``num_threading``, ``locate`` and ``headers`` (in that
    order), stores what ``parser.get`` returns in the module-level global of
    the same name, and logs each value right after it has been read.
    """
    global num_threading, locate, headers

    section = 'system'
    config = SafeConfigParser()
    config.read("config.ini")

    num_threading = config.get(section, 'num_threading')
    INFO("[Config_paser] num_threading = {}".format(num_threading))

    locate = config.get(section, 'locate')
    INFO("[Config_paser] locate path = {}".format(locate))

    headers = config.get(section, 'headers')
    INFO("[Config_paser] headers = {}".format(headers))
def engine_start(self, res=None):
    """Queue every request produced by an iterable, then run the workers.

    Args:
        res: iterable (typically a generator) of request objects exposing a
            ``method`` attribute. When ``None``, ``spider.start_request()``
            supplies the requests.

    Each request is handed to the scheduler according to its ``method``
    ("GET", "POST" or "DOWNLOAD"); other methods are ignored. Once the
    iterable is exhausted, the worker group matching the *last* request's
    method is started and waited for, and ``self._GetfromSchedule()`` is
    called.

    If the iterable yields nothing there is nothing to run, so the method
    returns right after logging. Previously this case crashed on an unbound
    ``res_object``.
    """
    request_object = spider.start_request() if res is None else res

    # ``None`` marks "nothing was yielded"; it is overwritten by each request.
    res_object = None
    # A ``for`` loop works for generators and plain iterables alike and on
    # both Python 2 and 3, unlike calling ``.next()`` by hand.
    for res_object in request_object:
        method = res_object.method
        if method == "GET":
            schedule.AddTodo_Get(res_object)
            INFO("[engine_Manager] send http for get!!")
        elif method == "POST":
            schedule.AddTodo_Post(res_object)
            INFO("[engine_Manager] send http for post!!")
        elif method == "DOWNLOAD":
            schedule.PutToDownload(res_object)
            INFO("[engine_Manager] send download message!!")
    INFO("[engine] [senToSchedule] generation is empty")

    if res_object is None:
        # Empty iterable: no request was scheduled, so no worker to start.
        return

    # NOTE(review): only the last request's method selects the worker group,
    # exactly as before; a batch mixing methods starts just one group.
    last_method = res_object.method
    if last_method == "GET":
        self.thread.add_func_get()
        self.thread.start()
        self.thread.waitForallThreadcompelete()
    elif last_method == "POST":
        self.thread.add_func_post()
        self.thread.start()
        self.thread.waitForallThreadcompelete()
    elif last_method == "DOWNLOAD":
        self.download.GetTodown()
        self.download.start()
        self.download.waitForallThreadcompelete()
    self._GetfromSchedule()
def PutToDownload(self, object):
    """Validate a download request and append it to ``self.download_list``.

    Args:
        object: a ``Request`` describing the download. (The parameter name
            shadows the builtin; it is kept for caller compatibility.)

    The stored entry is ``[url, download_type, method, filename]``.
    Nothing is stored, and a warning is logged, when ``object`` is not a
    ``Request``, when its ``url`` is ``None``, or when it has no
    ``download_type``. A method other than "DOWNLOAD" is replaced by
    "DOWNLOAD" in the stored entry; the request object itself is not modified.
    """
    if not isinstance(object, Request):
        WARNING("[Schedule] input incorrect download params, stop!!")
        return

    url = object.url
    if url is None:
        WARNING("[Schedule] url is empty ?? stop schedule!!!")
        return
    INFO("[Schedule] The url to download is {}".format(url))

    # getattr: the attribute may be absent altogether on the request.
    download_type = getattr(object, 'download_type', None)
    if download_type is None:
        WARNING("[Schedule] download_type is None, stop schedule")
        return

    method = object.method
    if method != "DOWNLOAD":
        WARNING(
            "[Schedule] method is not 'DOWNLOAD' , switch method to download"
        )
        # Assignment, not comparison: the entry must carry "DOWNLOAD".
        method = "DOWNLOAD"

    filename = getattr(object, 'filename', None)
    if filename is None:
        INFO("[Schedule] there is not exists filename")
    else:
        INFO("[Schedule] [download] filename is {}".format(filename))

    self.download_list.append([url, download_type, method, filename])
def AddTodo_Get(self, object):
    """Queue a GET request on ``self.Works_get``.

    The queued entry is ``[url, method, headers, callback]`` taken from
    ``object``. Nothing is queued, and a warning is logged, when ``object``
    is not a ``Request`` or when its ``url`` is ``None``.
    """
    if not isinstance(object, Request):
        WARNING("[Schedule] [GET]the wrong params!!")
        return

    request = object
    if request.url is None:
        WARNING("[Schedule] [GET] without url ??")
        return

    entry = [request.url, request.method, request.headers, request.callback]
    INFO("[Schedule] [GET] url = {}, method = {}, hearders = {}".format(
        request.url, request.method, request.headers))
    self.Works_get.append(entry)
def run(self):
    """Process every queued batch of requests and store the parsed results.

    ``self.workQueue`` holds queues of work items. A GET item is
    ``[url, method, headers, callback]``; a POST item is
    ``[url, method, request, headers, callback]``. For each item the request
    is sent through ``self.get`` / ``self.post``, the body is lower-cased,
    decoded as UTF-8 and parsed with ``etree.HTML``, and
    ``[parsed_tree, callback]`` is passed to ``schedule.Putresult_Get`` or
    ``schedule.Putresult_Post``. Items with any other method are dropped.
    """
    while not self.workQueue.empty():
        batch = self.workQueue.get(False)
        # Debug output kept from the original; the call form prints the same
        # text on Python 2 and is valid syntax on Python 3.
        print(batch.qsize())
        while not batch.empty():
            work = batch.get()
            method = work[1]
            if method == "GET":
                url = work[0]
                headers = work[2]
                callback = work[3]
                body = self.get(url=url, method=method,
                                headers=headers).content
                tree = etree.HTML(body.lower().decode("utf-8"))
                schedule.Putresult_Get([tree, callback])
            elif method == "POST":
                url = work[0]
                request = work[2]
                headers = work[3]
                callback = work[4]
                response = self.post(url=url, method=method,
                                     request=request, headers=headers)
                # Use the response body when ``self.post`` returns a response
                # object (as the GET branch does with ``.content``); otherwise
                # treat the return value itself as the body.
                body = getattr(response, 'content', response)
                # ``lower`` must be called; the original referenced the bound
                # method and failed on ``.decode``.
                tree = etree.HTML(body.lower().decode("utf-8"))
                INFO('final_res = {}'.format(tree))
                schedule.Putresult_Post([tree, callback])
def download_picture(self, image_url, path = "", filename = ""):
    """Download ``image_url`` and save it as ``<filename>.jpg`` under ``path``.

    Args:
        image_url: address of the image to fetch.
        path: target directory. When empty or ``None`` the configured
            ``Config_paser.locate`` is used, falling back to the current
            working directory.
        filename: file name without extension. When empty or ``None`` the
            last path segment of ``image_url`` (without its extension) is
            used, or ``image`` if the URL has no usable segment.

    Any error while fetching or writing is logged through ``WARNING`` and
    not re-raised.
    """
    if not path:
        # The defaults are "", so test for emptiness rather than ``is None``.
        path = getattr(Config_paser, 'locate', None) or os.getcwd()
    INFO("[Download] the picture will be downloaded in {}".format(path))

    if not filename:
        # A whole URL is not a valid file name (it contains "/"); keep only
        # its last path segment, without query string or fragment.
        tail = image_url.split('?', 1)[0].split('#', 1)[0]
        tail = tail.rstrip('/').rsplit('/', 1)[-1]
        filename = os.path.splitext(tail)[0] or 'image'

    # The joined path must be kept: the file is written there, not at the
    # directory itself.
    target = os.path.join(path, '{}.jpg'.format(filename))

    try:
        image = requests.get(image_url, stream = True)
        with open(target, 'wb') as img:
            img.write(image.content)
    except Exception as e:
        # Deliberate best-effort: a failed download is reported, not fatal.
        WARNING(e)
def AddTodo_Post(self, object):
    """Queue a POST request on ``self.Works_post``.

    Args:
        object: a ``Request`` describing the POST. (The parameter name
            shadows the builtin; it is kept for caller compatibility.)

    The queued entry is ``[url, method, request, headers, callback]``.
    The request payload is read from ``object.request`` or, when that is
    missing, from ``object.formdata``. The callback is read from
    ``object.callback`` or, when that is missing, from ``object.func``.
    Nothing is queued, and a warning is logged, when ``object`` is not a
    ``Request`` or when the url or the payload is ``None``.
    """
    # The parameter hides the builtin ``object``, so the type check has to
    # name the expected class explicitly (as AddTodo_Get does).
    if not isinstance(object, Request):
        WARNING("[Schedule] [POST] [POST]the wrong params!!")
        return

    obj = object
    # NOTE(review): the attribute names differ between call sites in this
    # file (request/formdata, func/callback); accept either - confirm which
    # one the Request class really exposes.
    request = getattr(obj, 'request', None)
    if request is None:
        request = getattr(obj, 'formdata', None)
    callback = getattr(obj, 'callback', None)
    if callback is None:
        callback = getattr(obj, 'func', None)

    if obj.url is None or request is None:
        WARNING("[Schedule] [POST] without url or request??")
        return

    # Pre-format the message: every other INFO call in this file passes a
    # single, already formatted string.
    INFO("[Schedule] [POST] url = {}, method = {}, request = {}, "
         "headers = {}".format(obj.url, obj.method, request, obj.headers))
    self.Works_post.append(
        [obj.url, obj.method, request, obj.headers, callback])
def download_anything(self, down_url, path = "", filename = ""):
    """Download ``down_url`` and save it under ``path``.

    Args:
        down_url: address of the resource to fetch.
        path: target directory. When empty or ``None`` the configured
            ``Config_paser.locate`` is used, falling back to the current
            working directory.
        filename: name of the file to write. When empty or ``None`` the last
            path segment of ``down_url`` is used, or ``download`` if the URL
            has no usable segment. A name without an extension gets
            ``.dat`` appended.

    Any error while fetching or writing is logged through ``WARNING`` and
    not re-raised.
    """
    if not path:
        # The defaults are "", so test for emptiness rather than ``is None``.
        path = getattr(Config_paser, 'locate', None) or os.getcwd()
    INFO("[Download] [download_anthing] the file will be downloaded in {}".format(path))

    if not filename:
        # A whole URL is not a valid file name (it contains "/"); keep only
        # its last path segment, without query string or fragment.
        tail = down_url.split('?', 1)[0].split('#', 1)[0]
        filename = tail.rstrip('/').rsplit('/', 1)[-1] or 'download'

    # splitext never fails on dot-less names, unlike indexing the result of
    # ``split('.')``.
    if not os.path.splitext(filename)[1]:
        filename += '.dat'

    # The joined path must be kept: the file is written there, not at the
    # directory itself.
    target = os.path.join(path, filename)

    try:
        result = requests.get(down_url, stream = True)
        with open(target, 'wb') as things:
            things.write(result.content)
    except Exception as e:
        # Deliberate best-effort: a failed download is reported, not fatal.
        WARNING(e)
def Get_result_Post(self):
    """Return a deep copy of ``self.result_post``.

    The stored results are not removed or modified; the caller receives an
    independent copy of the whole collection.
    """
    snapshot = copy.deepcopy(self.result_post)
    INFO("[Schedule] [result] get post result !!")
    return snapshot
def __init__(self, *args, **kwargs):
    """Build a request description from keyword arguments.

    Recognised keywords: ``url``, ``method``, ``formdata``, ``headers``,
    ``callback``, ``filename``, ``download_type`` and ``meta``. Positional
    arguments are accepted but ignored.

    Every attribute exists after construction (``None`` when not supplied).
    Without ``url`` a warning is logged and the remaining keywords are not
    processed. A missing ``method`` defaults to "GET". Supplying ``formdata``
    turns "GET" into "POST", and omitting it turns "POST" into "GET".
    Non-empty ``formdata`` (a dict or an iterable of pairs) is stored as a
    list of ``(key, value)`` tuples converted with ``str_to_unicode``.
    """
    # Define every attribute first so no code path leaves the instance
    # half-initialised.
    self.url = None
    self.method = None
    self.formdata = None
    self.headers = None
    self.callback = None
    self.filename = None
    self.download_type = None
    self.meta = None

    if 'url' not in kwargs:
        WARNING("[request] without url???, this request will be stopped!!")
        return
    self.url = kwargs.pop('url')
    INFO("[request] url to Crawer = {}".format(self.url))

    if 'method' in kwargs:
        self.method = kwargs.pop('method')
    else:
        WARNING("[request] without method auto set method to 'GET'")
        self.method = 'GET'

    if 'formdata' in kwargs:
        formdata = kwargs.pop('formdata')
        if self.method == "GET":
            WARNING(
                "[request] ...there exists dict of formdata turn 'GET' to 'POST'"
            )
            self.method = 'POST'
    else:
        formdata = None
        if self.method == 'POST':
            WARNING(
                "[request]...formdata is empty auto turn 'POST' to 'GET'")
            self.method = 'GET'

    if formdata:
        # ``items()`` exists on both Python 2 and 3 dicts.
        pairs = formdata.items() if isinstance(formdata, dict) else formdata
        self.formdata = [(str_to_unicode(k, encoding = 'utf-8'),
                          str_to_unicode(v, encoding = 'utf-8'))
                         for k, v in pairs]

    # Optional attributes: take the value when given, otherwise keep None
    # and log the same note as before.
    optional = (
        ('headers', "[request] without header, set header to None"),
        ('callback', "[request] nothing to callback!!"),
        ('filename', "[request] without filename......."),
    )
    for key, missing_note in optional:
        if key in kwargs:
            setattr(self, key, kwargs.pop(key))
        else:
            INFO(missing_note)

    if 'download_type' in kwargs:
        self.download_type = kwargs.pop('download_type')
    elif self.method == "DOWNLOAD":
        WARNING(
            "[request] method is download,but there isn't exists and method??, stop it!!"
        )

    # ``headers`` is read exactly once above; the original popped it a second
    # time here, which always failed and reset the value to None.
    self.meta = kwargs.pop('meta', None)