def putRequest(queue, payload=None):
    response = {}
    data = {}
    while not queue.empty():
        resourceURI = queue.get(timeout=DMON_TIMEOUT)
        response['Node'] = resourceURI
        try:
            if payload is None:
                r = requests.put(resourceURI, timeout=20)
            else:
                r = requests.put(resourceURI, data=payload, timeout=20)
            if r.headers['Content-Type'] == 'application/json':
                data = r.json()
            else:
                data = r.text
            response['StatusCode'] = r.status_code
            response['Data'] = data
        except requests.exceptions.Timeout:
            response['StatusCode'] = 408
            response['Data'] = data
        except requests.exceptions.ConnectionError:
            response['Node'] = resourceURI
            response['StatusCode'] = 404
            response['Data'] = 'n/a'
        GreenletRequests.NodeResponsesPost.append(response)
        # print 'Threaded PUT with ID ' + str(GreenletRequests.npo) + ' executed for ' + resourceURI
        app.logger.info('[%s] : [INFO] Thread PUT with ID %s executed for %s',
                        datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'),
                        str(GreenletRequests.npo), resourceURI)
        GreenletRequests.npo += 1
        gevent.sleep(0)
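For context, a minimal sketch of how a queue-draining worker such as putRequest above can be driven: fill a gevent queue with node URIs, spawn a few greenlets over the shared queue, and wait for them to finish. The URIs and greenlet count below are illustrative placeholders, not values from the original module.

import gevent
from gevent.queue import Queue

nodeQueue = Queue()
for uri in ['http://node1:5222/agent', 'http://node2:5222/agent']:  # placeholder node URIs
    nodeQueue.put(uri)

# Each greenlet loops until the shared queue is drained.
workers = [gevent.spawn(putRequest, nodeQueue, payload=None) for _ in range(3)]
gevent.joinall(workers)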
def randomT(queue, name):
    while not queue.empty():
        t = queue.get(timeout=1)
        gevent.sleep(5)
        print "I am + " + name + " executing " + str(GreenletRequests.ng)
        GreenletRequests.ng += 1
        gevent.sleep(0)
def putRequest(queue, payload=None):
    response = {}
    data = {}
    while not queue.empty():
        resourceURI = queue.get(timeout=1)
        response["Node"] = resourceURI
        try:
            if payload is None:
                r = requests.put(resourceURI, timeout=20)
            else:
                r = requests.put(resourceURI, data=payload, timeout=20)
            if r.headers["Content-Type"] == "application/json":
                data = r.json()
            else:
                data = r.text
            response["StatusCode"] = r.status_code
            response["Data"] = data
        except requests.exceptions.Timeout:
            response["StatusCode"] = 408
            response["Data"] = data
        except requests.exceptions.ConnectionError:
            response["Node"] = resourceURI
            response["StatusCode"] = 404
            response["Data"] = "n/a"
        GreenletRequests.NodeResponsesPost.append(response)
        print "Threaded PUT with ID " + str(GreenletRequests.npo) + " executed for " + resourceURI
        GreenletRequests.npo += 1
        gevent.sleep(0)
def deleteRequest(queue):
    response = {}
    while not queue.empty():
        resURI = queue.get(timeout=DMON_TIMEOUT)
        try:
            r = requests.delete(resURI, timeout=DMON_TIMEOUT)
            data = r.json()
            response['Node'] = resURI
            response['StatusCode'] = r.status_code
            response['Data'] = data
        except requests.exceptions.Timeout:
            response['Node'] = resURI
            response['StatusCode'] = 408
            response['Data'] = 'n/a'
        except requests.exceptions.ConnectionError:
            response['Node'] = resURI
            response['StatusCode'] = 404
            response['Data'] = 'n/a'
        GreenletRequests.NodeResponsesGet.append(response)
        # print 'Threaded DELETE with ID ' + str(GreenletRequests.nd) + ' executed for ' + resURI
        app.logger.info('[%s] : [INFO] Thread DELETE with ID %s executed for %s',
                        datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'),
                        str(GreenletRequests.nd), resURI)
        GreenletRequests.nd += 1
        gevent.sleep(0)
def _download_helper():
    while not queue.empty():
        h = queue.get()
        if not h:
            break
        r = requests.get(VT_DOWNLOAD, params={"apikey": apikey, "hash": h})
        open(h, "wb").write(r.content)
def response_generator():
    ii = 0
    while npending or not queue.empty():
        ii += 1
        result = queue.get()
        msg = '{} {}\n'.format(ii, result)
        print(msg, end='')
        yield msg
    t2 = datetime.datetime.now()
    print('====', t2 - t1)
def RegexpMatchWait(queue):
    if queue.empty():
        gevent.sleep(1)
        return ''
    (tweet_dic, match_result) = queue.get()
    if tweet_dic is None or match_result is None:
        return "\n"
    result_dic = tweet_dic.copy()
    result_dic['match_result'] = match_result
    logging.info('waiting tweet text got: %s' % str(result_dic))
    return "%s\n" % json.dumps(result_dic)
def randomT(queue, name):
    while not queue.empty():
        t = queue.get(timeout=1)
        gevent.sleep(5)
        # print 'I am + ' + name + ' executing ' + str(GreenletRequests.ng)
        app.logger.info('[%s] : [INFO] %s executing %s',
                        datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'),
                        name, str(GreenletRequests.ng))
        GreenletRequests.ng += 1
        gevent.sleep(0)
def send_message(socket):
    global queue
    while True:
        try:
            if not queue.empty():
                # print("QUEUE NOT EMPTY")
                message = queue.get(block=False)
                if not socket.closed:
                    socket.send(json.dumps(message))
                    # print('Sent response')
            # We need a sleep call so that other greenlets can run
            gevent.sleep()
        except Exception as e:
            print("SEND: %s" % e)
            raise e
def init():
    # queue init
    # main.queue.put("")
    # main.pool.spawn(getLink).join()
    # give worker pool
    print('start crawling')
    # while not pool.free_count() == 15:
    while not queue.empty():
        gevent.sleep(0.8)
        for x in range(0, min(queue.qsize(), pool.free_count())):
            pool.spawn(getData)
    # wait for everything to complete
    pool.join()
def task_thread(self, queue):
    """
    Executes tasks in queue
    """
    while not self.shutdown.is_set():
        if queue.empty() is False:
            (job, task) = queue.get_nowait()
            # Don't run the task if the job is done
            if job.status in [Status.ERROR, Status.ABORT]:
                task.status = Status.ABORT
            else:
                options = {}
                gpu_id = -1
                try:
                    if isinstance(task, model_tasks.TrainTask):
                        ### Select GPU
                        if len(self.gpu_list):
                            for gpu in self.gpu_list:
                                if not gpu['active']:
                                    gpu_id = gpu['index']
                                    gpu['active'] = True
                                    break
                            assert gpu_id != -1, 'no available GPU'
                        else:
                            gpu_id = None
                        options['gpu_id'] = gpu_id
                    task.run(**options)
                except Exception as e:
                    logger.error('%s: %s' % (type(e).__name__, e), job_id=job.id())
                    task.exception = e
                    task.traceback = traceback.format_exc()
                    task.status = Status.ERROR
                finally:
                    ### Release GPU
                    if gpu_id != -1 and gpu_id is not None:
                        for gpu in self.gpu_list:
                            if gpu['index'] == gpu_id:
                                gpu['active'] = False
        else:
            # Wait before checking again for a task
            time.sleep(utils.wait_time())
def _download_helper():
    t = time.time()
    while not queue.empty():
        h = queue.get()
        if not h:
            break
        if h == "wait":
            time.sleep(max(0, 60 - time.time() + t))
            t = time.time()
            continue
        if os.path.exists(h):
            print "skipping..", h
            continue
        r = requests.get(VT_DOWNLOAD, params={"apikey": apikey, "hash": h})
        open(h, "wb").write(r.content)
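A possible way to feed the helper above, assuming the "wait" sentinel exists to sit out a per-minute request limit: interleave a "wait" marker into the queue of hashes every few entries and then run the helper. The API key, endpoint value, and hash list are placeholders, not values from the original script.

import Queue  # Python 2 stdlib queue, matching the snippet's print syntax

apikey = "YOUR-VT-API-KEY"  # placeholder
VT_DOWNLOAD = "https://www.virustotal.com/vtapi/v2/file/download"  # assumed v2 endpoint
queue = Queue.Queue()

hashes = ["d41d8cd98f00b204e9800998ecf8427e"]  # placeholder hash list
for idx, h in enumerate(hashes, 1):
    queue.put(h)
    if idx % 4 == 0:
        queue.put("wait")  # sentinel: sleep out the remainder of the current minute

_download_helper()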
def getrequestFile(queue, output):
    response = {}
    while not queue.empty():
        resURI = queue.get(timeout=1)
        app.logger.info('[%s] : [INFO] Thread File GET with ID %s starts execution for %s',
                        datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'),
                        str(GreenletRequests.ng), resURI)
        hostURL = urlparse(resURI)
        hostID = hostURL.hostname
        logName = 'worker-%s.tar' % hostID
        logDump = os.path.join(output, logName)
        try:
            r = requests.get(resURI, timeout=DMON_TIMEOUT, stream=True)
            if r.status_code == 200:
                with open(logDump, 'wb') as out_file:  # TODO: investigate chunked writer
                    shutil.copyfileobj(r.raw, out_file)
            response['Node'] = resURI
            response['StatusCode'] = r.status_code
            response['LogName'] = logDump
            response['Headers'] = r.headers
            del r
        except requests.exceptions.Timeout:
            response['Node'] = resURI
            response['StatusCode'] = 408
            response['LogName'] = logDump
        except requests.exceptions.ConnectionError:
            response['Node'] = resURI
            response['StatusCode'] = 404
            response['LogName'] = logDump
        GreenletRequests.NodeResponsesGet.append(response)
        # print 'Threaded GET with ID ' + str(GreenletRequests.ng) + ' executed for ' + resURI
        app.logger.info('[%s] : [INFO] Thread File GET with ID %s executed for %s',
                        datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'),
                        str(GreenletRequests.ng), resURI)
        GreenletRequests.ng += 1
        gevent.sleep(0)
def scrape_base_url():
    global data
    startTime = datetime.now()
    tree = html.fromstring(session.get(base_url).text)
    func = lambda x: queue.put_nowait((parse_comp, {
        'url': domain + x.xpath('./@href')[0],
        'name': x.xpath('./text()')[0]
    }))
    [func(x) for x in tree.xpath('//div[@class="st-text"]//td/a') if x.xpath('./text()') != []]
    while not queue.empty() and not pool.full():
        for x in xrange(0, min(queue.qsize(), pool.free_count())):
            t = queue.get_nowait()
            pool.start(pool.spawn(t[0], t[1]))
    pool.join()
    print 'Time Taken : ', datetime.now() - startTime
    with open('data.json', 'w') as fp:
        json.dump(data, fp)
def deleteRequest(queue):
    response = {}
    while not queue.empty():
        resURI = queue.get(timeout=1)
        try:
            r = requests.delete(resURI, timeout=2)
            data = r.json()
            response["Node"] = resURI
            response["StatusCode"] = r.status_code
            response["Data"] = data
        except requests.exceptions.Timeout:
            response["Node"] = resURI
            response["StatusCode"] = 408
            response["Data"] = "n/a"
        except requests.exceptions.ConnectionError:
            response["Node"] = resURI
            response["StatusCode"] = 404
            response["Data"] = "n/a"
        GreenletRequests.NodeResponsesGet.append(response)
        print "Threaded DELETE with ID " + str(GreenletRequests.nd) + " executed for " + resURI
        GreenletRequests.nd += 1
        gevent.sleep(0)
def getData():
    # Loop only for as many URLs as we have on hand
    global error_count
    error_log = open('./err.txt', mode='a')
    while not queue.empty():
        # Take a stored link from the queue.
        # The pool workers issue the requests, roughly n times faster than doing it synchronously.
        link = queue.get(timeout=0)
        if link != "":
            gevent.sleep(0.3)
            getdata = requests.get(link)
            soup = BS(getdata.text, 'lxml-xml')
            # validation check
            okflag = soup.find('resultCode')
            try:
                if okflag.text != '00':
                    print("okflag: ", okflag.text)
                    raise ValueError('okcode is not 00')
                else:
                    # If the lookup succeeds, parse the items into the spreadsheet.
                    # pool map vs pool map_async: still need to decide which is more efficient.
                    print(len(soup.find_all('item')))
                    pool_excel.map(makeCSV, soup.find_all('item'))
            except:
                error_log.write(link + '\n')
                error_log.write('==================================\n')
                error_count += 1
                error_log.write(str(error_count) + '\n')
                queue.put(link)
    print('stop crawling')
    print(main_row)
    error_log.close()
    if result != '':
        print 'Found [%s][%s] in %s' % (result, link, tag)
        a += 1
        if tag in json_dict:
            json_dict[tag].append((result, link, imgs[i]))
        else:
            json_dict[tag] = list()
            json_dict[tag].append((result, link, imgs[i]))


r = session.get(url)
tree = html.fromstring(r.text)
a_tags = tree.xpath('//li[@class="menu-item"]//a')
tags = [(x.xpath('.//@href'), repr(x.xpath('.//text()'))) for x in a_tags]
for t in tags:
    url = t[0]
    result = regex.findall(t[1])
    # print url, result
    # scrape(url[0], result[0])
    queue.put((url[0], result[0]))

while not queue.empty() and not pool.full():
    for x in xrange(0, min(queue.qsize(), pool.free_count())):
        pool.spawn(worker)
pool.join()

print a
print 'Time Taken : ', datetime.now() - start_time
with open('data.json', 'w') as fp:
    json.dump(json_dict, fp)
                  (response.status_code, url))
        except gevent.queue.Empty:
            print('queue empty')
            break


if __name__ == '__main__':
    if len(sys.argv) != 3:
        print('USAGE:\n\t%s <base_url> <entry_path>' % sys.argv[0])
        sys.exit(1)
    if validators.url(sys.argv[1]) != True:
        print('Invalid Url')
        sys.exit(1)

    queue.put(getUrl(sys.argv[2]))
    pool.spawn(crawler)

    while 1:
        if queue.empty() and pool.free_count() == WORKER_COUNT:
            print('No more links left and nothing running')
            break
        for x in range(0, min(queue.qsize(), pool.free_count())):
            pool.spawn(crawler)
        gevent.sleep(0.1)

    # Wait for everything to complete
    pool.join()
            break
    print "job done"
    handler.log("job done")
    print "so far crawled %s pages" % crawled
    handler.log("so far crawled %s pages" % crawled)


queue.put(start_url_1)
queue.put(start_url_2)
pool.spawn(crawler)
handler = Handler()

print 'starting Crawler...'
handler.log('starting Crawler...')

while not queue.empty() and not pool.free_count() == workers_count:
    gevent.sleep(0.8)
    for x in xrange(0, min(queue.qsize(), pool.free_count())):
        pool.spawn(crawler)

# wait for jobs to finish
pool.join()

print "Done"
handler.log("Done+\n")
print '\n'
print "collected %s imgs" % ITEMS_COUNT
handler.log("collected %s imgs" % ITEMS_COUNT)
print "see generated output and log files"
handler.close()  # close the IO files
    global crawled
    while 1:
        try:
            u = queue.get(timeout=0)
            response = requests.get(u)
            print response.status_code, u
            for link in re.findall('<a href="(http.*?)"', response.content):
                if crawled < 10:
                    crawled += 1
                    queue.put(link)
        except gevent.queue.Empty:
            break


# Read the seed url from stdin
queue.put(sys.argv[1])
pool.spawn(crawler)

while not queue.empty() and not pool.free_count() == 5:
    gevent.sleep(0.1)
    for x in xrange(0, min(queue.qsize(), pool.free_count())):
        pool.spawn(crawler)

# Wait for everything to complete
pool.join()
print datetime.now() - startTime  # Took 5.943 seconds, varying
def crawler():
    '''A very simple queued gevent web crawler'''
    global crawled
    while 1:
        try:
            u = queue.get(timeout=1)
            response = requests.get(u)
            print(response.status_code)
            # Extract some links to follow
            for link in re.findall('<a href="(http.*?)"', response.content):
                # Limit to 10 pages (ignores links when the pool is already full)
                if crawled < 10:
                    crawled += 1
                    queue.put(link)
        except gevent.queue.Empty:
            break


queue.put(sys.argv[1])
while not queue.empty() and not pool.free_count() == 5:
    gevent.sleep(0.1)
    for x in xrange(0, min(queue.qsize(), pool.free_count())):
        pool.spawn(crawler)

# Wait for everything to complete
pool.join()
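The crawler above relies on module-level setup that the excerpt omits; a plausible reconstruction, inferred from the gevent.queue.Empty handler and the pool.free_count() == 5 check. The monkey patch is an assumption, added so that requests yields to other greenlets.

import re
import sys
import gevent
import requests
from gevent import monkey
from gevent.pool import Pool
from gevent.queue import Queue

monkey.patch_all()  # assumed: makes requests cooperative under gevent

pool = Pool(5)      # size inferred from the free_count() == 5 check
queue = Queue()
crawled = 0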