def crawl_worker(agent_cfg, url_tuple):
    """Crawl the given URL. Runs in parallel; cannot be a class method."""
    MAX_SLEEP_BEFORE_JOB = 10  # prevent starting all parallel processes at the same instant
    sleep(random() * MAX_SLEEP_BEFORE_JOB)  # stagger worker start-up
    try:
        idx, url = url_tuple
        idx = str(idx)
        stdout_log = os.path.join(agent_cfg['job_dir'],
                                  fu.get_out_filename_from_url(url, idx, '.txt'))
        if url[:5] not in ('data:', 'http:', 'https', 'file:'):
            url = 'http://' + url
        if agent_cfg['use_mitm_proxy']:
            proxy_opt = mitm.init_mitmproxy(stdout_log[:-4], agent_cfg['timeout'],
                                            agent_cfg['mitm_proxy_logs'])
        else:
            proxy_opt = ""
        if 'chrome_clicker' not in agent_cfg['type']:
            cmd = get_visit_cmd(agent_cfg, proxy_opt, stdout_log, url)
            wl_log.info('>> %s (%s) %s' % (url, idx, cmd))
            status, output = ut.run_cmd(cmd)  # run the visit command
            if status and status != ERR_CMD_TIMEDOUT:
                wl_log.critical('Error while visiting %s(%s) w/ command: %s: (%s) %s'
                                % (url, idx, cmd, status, output))
            else:
                wl_log.info(' >> ok %s (%s)' % (url, idx))
        else:
            cr.crawl_url(agent_cfg['type'], url, proxy_opt)
        sleep(2)  # make sure mitmdump has timed out before we process the network dump
        if agent_cfg['post_visit_func']:  # pluggable function that parses the logs
            agent_cfg['post_visit_func'](stdout_log, crawl_id=agent_cfg['crawl_id'])
    except Exception as exc:
        wl_log.critical('Exception in worker function %s %s' % (url_tuple, exc))
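
# A minimal sketch of how crawl_worker might be driven in parallel. The
# docstring's note that it "cannot be a class method" is the usual constraint
# for multiprocessing.Pool workers: they must be picklable, module-level
# callables. The pool size and the use of enumerate()/partial here are
# assumptions, not confirmed details of the original harness.
from functools import partial
from multiprocessing import Pool

def run_crawl(agent_cfg, urls):
    worker = partial(crawl_worker, agent_cfg)   # bind the shared config
    with Pool(processes=4) as pool:
        pool.map(worker, enumerate(urls))       # yields the (idx, url) tuples the worker unpacks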
def test_should_timeout_onbeforeunload(self):
    test_url = cm.BASE_TEST_URL + "crawler/onbeforeunload.html"
    try:
        ut.timeout(cr.CRAWLER_CLICKER_VISIT_TIMEOUT + 2)
        cr.crawl_url("chrome_clicker", test_url, "")
    except ut.TimeExceededError as texc:
        self.fail("Crawl has timed out %s" % texc)
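
# The ut.timeout / ut.TimeExceededError pair used above is not shown in this
# snippet. A common way to implement such a watchdog on POSIX systems is
# signal.alarm; this is a hypothetical reconstruction, not the project's
# actual helper.
import signal

class TimeExceededError(Exception):
    """Raised when the watchdog alarm fires before the work finishes."""

def _raise_time_exceeded(signum, frame):
    raise TimeExceededError("timed out")

def timeout(seconds):
    # arrange for SIGALRM to interrupt the main thread after `seconds`
    signal.signal(signal.SIGALRM, _raise_time_exceeded)
    signal.alarm(seconds)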
def __init__(self, html=' ', url=None, base_href=None, **extras):
    """Initialize the class.

    :param html: the crawled HTML document
    :param url: if set, the html param is ignored and the page at url is crawled
    :param base_href: the base href setting of the page
    :param extras: extra settings to add
    """
    if url is None:
        self.html = html
    else:
        crawled_page = crawl_url(url=url)
        self.html = crawled_page
    self.base_href = base_href
    self._parsed = build_html_tree(self.html, self.base_href)
    self.encode = get_encode_type(self.html)
    self.content_node = None
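
# Usage sketch for the constructor above. The enclosing class name is not
# shown in the snippet, so "HtmlDocument" is a placeholder; the two
# construction paths (raw HTML vs. fetch-by-URL) follow from the branches
# in __init__.
doc_from_html = HtmlDocument(html="<html><body><p>hello</p></body></html>")
doc_from_url = HtmlDocument(url="http://example.com/", base_href="http://example.com/")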
def grid_search(ra_0, ra_1, dec_0, dec_1, interval):
    ra_0, ra_1 = sorted([ra_0, ra_1])
    dec_0, dec_1 = sorted([dec_0, dec_1])
    ra_length = int((ra_1 - ra_0) // interval)
    dec_length = int((dec_1 - dec_0) // interval)
    datafilename = 'data/%g-%g-%g-%g-%g.csv' % (ra_0, ra_1, dec_0, dec_1, interval)
    if os.path.isfile(datafilename):
        print('File already exists.')
        return datafilename
    f = open(datafilename, 'w')
    f.write('ObjectNo, ra, dec, type, u, g, r, i, z\n')
    ObjectList = []
    num = ra_length * dec_length
    print('Started crawling %d spots...' % num)
    for ra_i in range(ra_length):
        for dec_i in range(dec_length):
            ra = ra_0 + interval * ra_i
            dec = dec_0 + interval * dec_i
            data = crawl_url(ra, dec, 0.5)
            if data == 0:
                continue
            if data[3] == 'STAR' and data[0] not in ObjectList:  # avoid duplicates
                ObjectList.append(data[0])
                f.write(", ".join(data) + "\n")
            x = 1 + dec_i + ra_i * dec_length
            if x % 10 == 0:
                print('Crawled %d out of %d spots' % (x, num))
    f.close()
    print('Done!')
    return datafilename
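
# Usage sketch: scan a 1 x 1 degree patch of sky at 0.2-degree spacing. The
# underlying crawl_url(ra, dec, radius) presumably queries a sky-survey
# service for the object at each grid point; that signature is inferred from
# the call above, not documented here.
import csv

path = grid_search(150.0, 151.0, 2.0, 3.0, 0.2)
with open(path) as fh:
    for row in csv.reader(fh):
        print(row)  # ObjectNo, ra, dec, type, u, g, r, i, z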
def work(self, should_continue):
    if not should_continue():
        return
    context = zmq.Context()
    with pull_socket(context, self._from_bind) as source, \
            push_socket(context, self._to_bind) as destination:
        poller = zmq.Poller()
        poller.register(source, zmq.POLLIN)
        poller.register(destination, zmq.POLLOUT)
        while should_continue():
            url, data = None, None
            presults = dict(poller.poll(timeout=10))
            if len(presults) == 2:
                try:
                    url, data = source.recv_multipart(zmq.NOBLOCK)
                    self.log.info([url, data])
                except zmq.ZMQError as ze:
                    self.log.error("something bad happened maybe\n\n %s" % str(ze))
            if url and data:
                response = None
                try:
                    data = json.loads(data)
                    response = crawl_url(url=url, etag=data["etag"],
                                         last_modified=data["last_modified"])
                    if response:
                        self.log.info("got response for %s" % url)
                except Exception:
                    self.log.error("could not crawl %s" % url)
                if response:
                    try:
                        self.log.info("sending to destination....")
                        destination.send_multipart([url, str(response)])
                    except zmq.ZMQError as ze:
                        self.log.error("could not send result of crawl of %s \n\n %s"
                                       % (url, str(ze)))
            self.log.info("did something...")
            self.log.info([presults, source in presults, destination in presults])
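
# A minimal sketch of a producer feeding the worker above. The pull_socket /
# push_socket helpers are not shown, so this uses raw pyzmq directly; the
# endpoint topology (worker binds, producer connects) and the JSON payload
# shape (etag / last_modified) mirror what work() expects but are assumptions.
import json
import zmq

def push_job(endpoint, url, etag, last_modified):
    context = zmq.Context()
    sock = context.socket(zmq.PUSH)
    sock.connect(endpoint)  # the worker's PULL side binds, so we connect
    payload = json.dumps({"etag": etag, "last_modified": last_modified})
    sock.send_multipart([url.encode(), payload.encode()])
    sock.close()
    context.term()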
from crawler import crawl_url, crawl_news

# collect the page-1 URLs for the finance (259) category on 2019-07-01
urlDatas = crawl_url(259, '20190701', 1)
print('Press: ' + urlDatas[1]['press'])
print('url: ' + urlDatas[1]['url'])

url = urlDatas[3]['url']
newsData = crawl_news(url)

# news title
title = newsData['title']
print(title)
print("\n")

# publication date
publishedAt = newsData['publishedAt']
# thumbnail URL
thumbnail = newsData['thumbnail']
# news body
content = newsData['content']
# Naver News summary (beta)
summary = newsData['summary']
# reaction counts: like, warm, sad, angry, "want a follow-up story"
like = newsData['like']
warm = newsData['warm']
sad = newsData['sad']
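
# The comment above lists five reaction types but the snippet only reads
# three. A compact way to gather all five, assuming the remaining keys are
# 'angry' and 'want' ('angry' also appears in the loop further below; 'want'
# is a guess from the comment, not a confirmed key):
reactions = {k: newsData[k] for k in ('like', 'warm', 'sad', 'angry', 'want')}
print(reactions)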
categoryList = [
    264, 265, 268, 266, 267, 269, 259, 258, 261, 771, 260, 262,
    310, 263, 249, 250, 251, 254, 252, 253, 255, 256, 276, 257,
    241, 239, 240, 237, 238, 376, 242, 243, 244, 248, 245, 231,
    232, 233, 234, 322, 731, 226, 227, 230, 732, 283, 229, 228
]

for category in categoryList:
    startDate = 20190601
    endDate = 20190602
    page = 1
    crawledUrl = []
    while True:
        if startDate == endDate:
            break
        urlDatas = crawl_url(category, str(startDate), page)
        for urlData in urlDatas:
            ID = str(startDate) + randomString()
            try:
                url = urlData['url']
                if url in crawledUrl:
                    page = 20  # duplicate URL encountered; jump ahead
                    break
                crawledUrl.append(url)
                newsData = crawl_news(url)
                title = newsData['title']
                content = newsData['content']
                like = newsData['like']
                angry = newsData['angry']
            except Exception:
                continue  # assumed handler; the source snippet ends inside the try block
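
# randomString() is used above to build unique IDs but is not defined in the
# snippet. A plausible stand-in (an assumption, not the original helper):
import random
import string

def randomString(length=8):
    # random alphanumeric suffix, e.g. 'a3F9kQ2z'
    return ''.join(random.choices(string.ascii_letters + string.digits, k=length))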