def __init__(self, start_url, subdomains):
    self.start_req = Request('get', start_url, '/')
    self.scheduler = Scheduler(subdomains)
    self.spider = Spider()
    self.downloader = Downloader()
    # enqueue the initial request
    self.scheduler.put_request(self.start_req)
async def get_request(self):
    while True:
        data = await self.client.rpop('EngineQueue')
        if data:
            req = json.loads(data)
            return Request(method=req['method'], target=req['target'], path=req['path'],
                           headers=req['headers'], body=req['body'])
        return None
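# Hedged sketch (not from the source): a producer counterpart to get_request above.
# It assumes the same awaitable redis client (lpush, as used elsewhere in this section)
# and that Request exposes method/target/path/headers/body as attributes; the field
# names match what get_request pops from 'EngineQueue'.
import json

async def put_engine_request(client, req):
    # serialize the request into the JSON layout get_request deserializes
    await client.lpush('EngineQueue', json.dumps({
        'method': req.method,
        'target': req.target,
        'path': req.path,
        'headers': req.headers,
        'body': req.body,
    }))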
def logLocationRequest():
    apiKey = returnAPIKey("ip_stack")
    ipStackEndpoint = "http://api.ipstack.com/81.180.208.125?access_key=" + str(apiKey)
    ipStackEndPointToLog = "http://api.ipstack.com/81.180.208.125?access_key="
    detailedResponse = requests.get(ipStackEndpoint)
    request = Request("IP_Stack", ipStackEndPointToLog, detailedResponse.encoding,
                      detailedResponse.status_code, detailedResponse.elapsed.total_seconds())
    writeJsonToFile(request)
def before_request():
    g.headers = {}
    g.pagination = Pager(request.args)
    g.request = Request(request)
    g.auth = None
    g.perms = {}
    g.im_rds = rds
    cnf = MYSQL
    g._db = Mysql(*cnf)
    g._imdb = g._db
def run(self):
    size = 1024
    # receive {"url": "", "range-left": "", "range-right": ""} from the client
    msg = self.client_conn.recv(size)
    if msg:
        msg = msg.decode()
        print("[+] Received Message: {}".format(msg))
        msg = json.loads(msg)
        # generate a random name for the temp file
        filename = Calculation().generate_random_string(12)
        filepath = self.temp_dir + filename
        # pull the download parameters from the client message
        url = msg['url']
        range_left = msg['range-left']
        range_right = msg['range-right']
        response = Request().make_request(url, self.proxy)
        # run the multithreaded download in a separate process
        print("starting new process to download {}".format(filename))
        process = multiprocessing.Process(
            target=MultithreadedDownloader().download,
            args=(
                url,
                range_left,
                range_right,
                filepath,
                self.temp_dir,
                response,
                self.threads,
                self.proxy,
            )
        )
        process.start()
        process.join()
        print('Out of process for file {}'.format(filename))
        # send the downloaded file part to the peer-client
        self.send_file_part(filepath)
        # let the peer-client know that file sending is done
        self.client_conn.shutdown(socket.SHUT_RDWR)
        # close the connection with the peer-client
        self.client_conn.close()
        print("[-] Client Disconnected: {}".format(self.client_addr))
        # delete the temp file
        FileHandler().delete_file(filepath)
        print("[-] Temp File Deleted.")
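# Hedged sketch (not part of the source): a minimal client for the handler above.
# It sends the {"url", "range-left", "range-right"} message the handler expects and
# reads the file part until the server shuts the connection down; host, port and the
# output path are placeholders.
import json
import socket

def request_file_part(host, port, url, range_left, range_right, out_path):
    with socket.create_connection((host, port)) as conn:
        conn.sendall(json.dumps({
            "url": url,
            "range-left": range_left,
            "range-right": range_right,
        }).encode())
        with open(out_path, "wb") as f:
            while True:
                chunk = conn.recv(4096)
                if not chunk:  # server called shutdown(), transfer is complete
                    break
                f.write(chunk)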
def logGetCityBasedOnCoordinates():
    apiKey = returnAPIKey("google_geo")
    latitude = str(getLocation()["latitude"])
    longitude = str(getLocation()["longitude"])
    googleMapsEndPoint = ("https://maps.googleapis.com/maps/api/geocode/json?latlng="
                          + latitude + "," + longitude + "&key=" + str(apiKey))
    googleMapsEndPointToLog = ("https://maps.googleapis.com/maps/api/geocode/json?latlng="
                               + latitude + "," + longitude + "&key=")
    detailedResponse = requests.get(googleMapsEndPoint)
    request = Request("Google_GEO", googleMapsEndPointToLog, detailedResponse.encoding,
                      detailedResponse.status_code, detailedResponse.elapsed.total_seconds())
    writeJsonToFile(request)
def logGetWeatherBasedOnCity():
    apiKey = returnAPIKey("open_weather")
    plusCode = str(getCityBasedOnCoordinates()["plus_code"])
    city = plusCode.split()[2].replace(",", "")
    openWeatherEndPoint = ("https://samples.openweathermap.org/data/2.5/weather?q="
                           + str(city) + "&appid=" + str(apiKey))
    openWeatherEndPointToLog = ("https://samples.openweathermap.org/data/2.5/weather?q="
                                + str(city))
    detailedResponse = requests.get(openWeatherEndPoint)
    request = Request("OpenWeather", openWeatherEndPointToLog, detailedResponse.encoding,
                      detailedResponse.status_code, detailedResponse.elapsed.total_seconds())
    writeJsonToFile(request)
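# Hedged sketch (not from the source): the three logging helpers in this section
# (logLocationRequest, logGetCityBasedOnCoordinates, logGetWeatherBasedOnCity) could
# be driven from one entry point; the ordering mirrors the location -> city -> weather
# data flow implied by the helpers they call.
if __name__ == "__main__":
    logLocationRequest()
    logGetCityBasedOnCoordinates()
    logGetWeatherBasedOnCity()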
def login(self) -> bool:
    if self._is_logged():
        return True
    data = {
        "grant_type": "password",
        "username": self._credentials.username,
        "password": self._credentials.password
    }
    response = Request.run(self._token_url, "POST", data, self._header(False), self._auth())
    if response['statusCode'] == 200:
        self._token = response['body']
        self._token['expires_date'] = datetime.fromtimestamp(
            time.mktime(datetime.now().timetuple())) + timedelta(
            seconds=int(self._token['expires_in']))
        return True
    return False
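# Hedged sketch (not from the source): how a caller might combine login() with the
# client's own _request helper (shown further below). The client variable and the
# profile URL are hypothetical.
def fetch_profile(client):
    if not client.login():
        raise RuntimeError("authentication failed")
    # once logged in, the stored token is attached by the client's header builder
    return client._request("https://api.example.com/v1/profile")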
def main(opt: Options):
    conn = opt.get_conn()
    num = opt.get_number()
    vehicle_data = {
        "vehicle": True,
        "vehicle_name": "",
        "end": True,
    }
    if START_MODE == "by-name":
        conn.set_kanban(SERVICE_NAME, num)
    elif START_MODE == "from-kanban":
        kanban = conn.get_one_kanban(SERVICE_NAME, num)
        metadata = kanban.get_metadata()
        vehicle_data = metadata['args']
    lprint(f"vehicle_data: {vehicle_data}")

    templates = []
    vehicle = vehicle_data['vehicle']
    vehicle_name = vehicle_data['vehicle_name']
    end = vehicle_data['end']
    try:
        ms = MySQLClient()
        if vehicle and not vehicle_name:
            templates += ms.get_all_vehicles()
            lprint("set_template all")
        elif vehicle and vehicle_name:
            templates += ms.get_by_vehicle_name(vehicle_name)
            lprint(f"set_template {vehicle_name}")
        if end:
            templates += ms.get_end()
            lprint("set_template end")
        if templates:
            with Request() as r:
                r.set_templates(templates)
    except Exception as e:
        print(str(e))
class Crawler():
    HOST = "www.nbiquge.com"
    SCHEMA = "https://"
    LIST_URL_KEY = "nbiquge_list_url"
    CHAPTER_URL_KEY = "nbiquge_chapter_url"
    # keeps track of chapter ordering
    INCR_KEY = "nbiquge_incr_key"

    def __init__(self, start_url="https://www.nbiquge.com/7_7295/"):
        logging.info("crawler init...")
        self.start_url = start_url
        self.request = Request()
        redisClient.lpush(self.LIST_URL_KEY, start_url)

    def run(self):
        self.consume_list()

    def consume_list(self):
        '''
        Fetch data serially: pop one list URL from redis at a time, crawl its
        chapters, then crawl the link to the next page. If no new link can be
        fetched, the program ends.
        '''
        list_url = redisClient.rpop(self.LIST_URL_KEY)
        if list_url is None:
            return
        logging.info("get url %s", list_url)
        self.parse_list(list_url)

    def parse_list(self, url):
        logging.info("parse_list get url: %s" % url.decode("utf-8"))
        rsp = self.request.get(url.decode("utf-8"))
        # set the response encoding explicitly (otherwise the decoded text is garbled)
        rsp.encoding = "gbk"
        root = fromstring(rsp.text)
        list_div = root.xpath('//div[@id="list"]')[0]
        chapters = list_div.xpath("//dd//a")
        logging.info("get %s chapter" % len(chapters))
        for chapter in chapters:
            chapter_id = redisClient.incr(self.INCR_KEY)
            task = {
                "href": self.SCHEMA + self.HOST + chapter.xpath(".//@href")[0],
                "name": chapter.xpath("string(.)"),
                "id": chapter_id
            }
            redisClient.lpush(self.CHAPTER_URL_KEY, umsgpack.packb(task))
            if chapter_id % 10 == 0:
                # drain the chapter queue every 10 chapters
                logging.info("now chapter_id: %s" % chapter_id)
                self.consume_chapter()

    def consume_chapter(self):
        while True:
            task = redisClient.lpop(self.CHAPTER_URL_KEY)
            if task is None:
                return
            self.consume_single_chapter(task)

    def consume_single_chapter(self, task):
        logging.info("consume_single_chapter get a task")
        task = umsgpack.unpackb(task)
        rsp = self.request.get(task["href"])
        rsp.encoding = "gbk"
        root = fromstring(rsp.text)
        content_div = root.xpath('//div[@id="content"]')[0]
        content = tostring(content_div, method="html", pretty_print=True, encoding="utf-8")
        content = content.decode("utf-8")
        content = content.replace("<br>", "")
        content = content.replace("\xa0", "")
        content = content.replace('<div id="content">', '')
        content = content.replace('</div>', "")
        sqlSession = sqlalchemyConn.DBSession()
        chapter = Chapter(chapter_id=task["id"], title=task["name"], content=content,
                          book_id=book_id, site=self.HOST)
        sqlSession.add(chapter)
        sqlSession.commit()

    @classmethod
    def restart(cls):
        '''
        Clear stored crawl data.
        '''
        redisClient.delete(cls.LIST_URL_KEY)
        redisClient.delete(cls.CHAPTER_URL_KEY)
        redisClient.delete(cls.INCR_KEY)
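# Hedged sketch (not from the source): a minimal entry point for the Crawler above,
# assuming logging, redisClient and sqlalchemyConn are already configured by the
# surrounding module.
if __name__ == "__main__":
    Crawler.restart()                                     # clear leftover queue state
    crawler = Crawler("https://www.nbiquge.com/7_7295/")  # default start URL from the class
    crawler.run()                                         # drain the list queue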
async def worker_job(client_socket: socket, worker_name: str):
    if Config.log_worker_verbose:
        logging.debug(f'WORKER_{worker_name}: spawned')

    # GET REQUEST
    loop = asyncio.get_event_loop()
    request_raw = ""
    while True:
        request_part = (await loop.sock_recv(client_socket, Config.bytes_per_recv)).decode()
        request_raw += request_part
        if '\r\n' in request_raw or len(request_part) == 0:
            break
    request = Request(request_raw)

    # GET FILENAME
    filepath: str
    search_folder = request.url.endswith('/')
    if search_folder:
        filepath = Config.base_dir + request.url + Config.index_filename
    else:
        filepath = Config.base_dir + request.url
    file_exists = os.path.exists(filepath)

    # CREATE RESPONSE
    response: Response
    if request.method not in ['GET', 'HEAD']:
        response = Response(method=request.method, protocol=request.protocol, status=405)
    elif '/..' in request.url or (search_folder and not file_exists):
        response = Response(method=request.method, protocol=request.protocol, status=403)
    elif (not file_exists) or (not request.is_valid):
        response = Response(method=request.method, protocol=request.protocol, status=404)
    else:
        response = Response(method=request.method, protocol=request.protocol, status=200,
                            filepath=filepath)
    logging.info(
        f'WORKER_{worker_name}: {response.status} {request.method} {request.url}'
    )

    # SEND RESPONSE
    await response.send(client_socket)

    # END WORKER
    client_socket.close()
    if Config.log_worker_verbose:
        logging.debug(f'WORKER_{worker_name}: closed client socket')
    if Config.log_worker_verbose:
        logging.debug(f'WORKER_{worker_name}: done')
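# Hedged sketch (not part of the source): an accept loop that could spawn worker_job
# for each incoming connection. Config.host and Config.port are assumed settings; the
# socket calls mirror the asyncio usage already inside worker_job.
import asyncio
import socket

async def serve():
    loop = asyncio.get_event_loop()
    server_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    server_socket.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
    server_socket.bind((Config.host, Config.port))
    server_socket.listen()
    server_socket.setblocking(False)
    worker_id = 0
    while True:
        client_socket, _ = await loop.sock_accept(server_socket)
        worker_id += 1
        # hand the connection off to a worker task
        asyncio.create_task(worker_job(client_socket, str(worker_id)))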
# check if a download url was supplied
if len(sys.argv) < 2:
    print("No Download URL! Exiting ...")
    sys.exit(0)

url = sys.argv[1]
client = ThreadedPeerClient(url)

# port used by the peer-client to communicate with the tracker
client_tracker_bind_port = peer_client_config.client_tracker_bind_port

# fetch the list of active servers
client.fetch_peers_list(tracker_server_address, client_tracker_bind_port)

# make a request to the url to get information about the file
req = Request()
response = req.make_request(url, proxy=proxy)
req.close_connection(response)

# get the filesize
filesize = int(response.headers['Content-Length'])
filename = os.path.basename(url.replace("%20", "_"))
filepath = download_dir + '/' + filename

# if range download is not supported, fall back to a plain single-system download
if response.headers.get('Accept-Ranges') != 'bytes':
    print("URL doesn't support range download! Using default download...")
    MultithreadedDownloader().download(url, 0, filesize - 1, filepath,
                                       temp_dir, response, threads, proxy)
def _request(self, url: str, data: dict = None, method: str = "GET") -> Dict[str, Any]:
    return Request.run(url, method, data, self._header(), self._auth())
async def test(self):
    # seed the downloader queue with an initial request
    await self.client.lpush(
        'DownloaderQueue',
        pickle.dumps(Request('get', 'https://www.baidu.com', '/')))
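# Hedged sketch (not from the source): the consumer side of the queue seeded above,
# assuming the same awaitable redis client and that the downloader pops pickled
# Request objects from 'DownloaderQueue'.
import pickle

async def pop_downloader_request(client):
    data = await client.rpop('DownloaderQueue')
    # deserialize back into the Request object pushed by the test
    return pickle.loads(data) if data else None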
class MultithreadedDownloader:
    """Main class providing the interface of the software"""

    def __init__(self):
        self.filehandle = FileHandler()
        self.request_handle = Request()
        self.calculate = Calculation()
        self.url = None
        self.range_left = None
        self.range_right = None
        self.proxy = None
        self.temp_dir = None
        self.threads = None
        self.filepath = None
        logging.getLogger("urllib3").setLevel(logging.WARNING)

    def range_download_support(self, resp):
        """ returns a boolean indicating support for range downloading """
        try:
            supported = (resp.headers['Accept-Ranges'] == 'bytes')
        except KeyError:
            supported = False
        return supported

    def multithreaded_download(self, ranges_list):
        """ performs the multithreaded download """
        # download each segment in its own thread,
        # calling Request.download_range() per thread
        for f in range(self.threads):
            t = threading.Thread(target=self.request_handle.download_range,
                                 kwargs={
                                     'url': self.url,
                                     'filepath': self.temp_dir + "/temp" + str(f),
                                     'range_left': ranges_list[f][0],
                                     'range_right': ranges_list[f][1],
                                     'proxy': self.proxy
                                 })
            t.daemon = True
            t.start()

        # join every thread except the main thread; this ensures the parts are
        # merged only after each thread has finished downloading
        main_thread = threading.current_thread()
        for t in threading.enumerate():
            if t is main_thread:
                continue
            t.join()

    def merge_multithreaded_download_parts(self):
        """ merges the parts downloaded by the individual threads on a single system """
        with open(self.filepath, 'wb') as wfd:
            for f in range(self.threads):
                tempfilepath = self.temp_dir + "/temp" + str(f)
                with open(tempfilepath, "rb") as fd:
                    shutil.copyfileobj(fd, wfd)
                # delete the copied segment
                self.filehandle.delete_file(tempfilepath)

    def download(self, url, range_left, range_right, filepath, temp_dir,
                 response, threads, proxy=None):
        """ performs the file download """
        self.url = url
        self.range_right = range_right
        self.range_left = range_left
        self.filepath = filepath
        self.temp_dir = temp_dir
        self.threads = threads
        self.proxy = proxy

        # if the server supports segmented download
        if self.range_download_support(response):
            # get the byte range each thread should download
            ranges_list = self.calculate.get_download_ranges_list(self.range_left,
                                                                  self.range_right,
                                                                  self.threads)
            # perform the multithreaded download on a single system
            self.multithreaded_download(ranges_list)
            # merge the downloaded parts
            self.merge_multithreaded_download_parts()
        else:
            print('''Server doesn't support multithreaded downloads!
Download will be performed using single thread, on master system.''')
            self.request_handle.download_range(self.url, self.filepath,
                                               self.range_left, self.range_right,
                                               self.proxy)
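# Hedged sketch (not from the source): a typical call into MultithreadedDownloader,
# mirroring the peer-client usage elsewhere in this section. The URL, paths and
# thread count are placeholders.
url = "https://example.com/file.bin"          # placeholder download URL
req = Request()
response = req.make_request(url, proxy=None)
filesize = int(response.headers['Content-Length'])
MultithreadedDownloader().download(
    url=url,
    range_left=0,
    range_right=filesize - 1,
    filepath="/tmp/file.bin",                 # placeholder destination
    temp_dir="/tmp/mtd_parts",                # placeholder scratch dir for per-thread parts
    response=response,
    threads=4,
    proxy=None,
)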
        norm_uri = ''.join([req.target, path_split[0]])
        query = ''
        if len(path_split) > 1:
            query = path_split[1]
        # normalize the query parameters
        new_query = []
        for k, v in map(self.split_query, query.split('&')):
            if self.is_data(k, v) and not self.is_action(k, v):
                new_query.append('='.join([k, '[data]']))
            else:
                new_query.append('='.join([k, v]))
        new_query = '&'.join(new_query)
        return '?'.join([norm_uri, new_query])

    def put_request(self, req):
        now_req_hash = self.get_request_hash(req)
        if (now_req_hash not in self.req_hash
                and self.in_subdomains(req.target)
                and not self.is_static(req.path)):
            self.req_queue.put(req)
            self.req_hash.add(now_req_hash)

    def get_request(self):
        return self.req_queue.get()


if __name__ == "__main__":
    req = Request('get', 'http://s:80', '/')
    scheduler = Scheduler()
    a = scheduler.get_request_hash(req)
    print(a)
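# Hedged sketch (not from the source): how the normalization above deduplicates
# requests that differ only in data-like query values. The Scheduler constructor
# arguments and the behaviour of is_data/is_action on 'id' are assumptions.
scheduler = Scheduler({'s'})                                  # assumed subdomain whitelist
scheduler.put_request(Request('get', 'http://s:80', '/item?id=1'))
scheduler.put_request(Request('get', 'http://s:80', '/item?id=2'))
# if 'id' is classified as data, both paths normalize to .../item?id=[data],
# so only the first request is actually queued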