def __init__(self, exit_event):
    self.exit_event = exit_event
    self.was_pages = {}
    self.db = DbPg(logger=None)
    self.rqueue = RabbitQueue(CRAWLER_EXCHANGE_NAME, CRAWLER_QUEUE_NAME)
    self.wait_queue()
    self.init_progress_table()
    self.get_ready_tasks()
def __init__(self, exit_event):
    self.exit_event = exit_event
    self.log = init_logger('cars_url_generator')
    self.was_pages = {}
    self.db = DbPg(self.log)
    self.rqueue = RabbitQueue(CRAWLER_EXCHANGE_NAME, CRAWLER_QUEUE_NAME)
    self.wait_queue()
    self.init_progress_table()
    self.get_ready_tasks()
class ShipsGenerator:
    def __init__(self, exit_event):
        self.exit_event = exit_event
        self.log = init_logger('ships_url_generator')
        self.was_pages = {}
        self.db = DbPg(self.log)
        self.rqueue = RabbitQueue(CRAWLER_EXCHANGE_NAME, CRAWLER_QUEUE_NAME)
        self.wait_queue()
        self.init_progress_table()
        self.get_ready_tasks()

    def wait_queue(self):
        # Block until the crawler queue is drained or the exit event fires.
        while self.rqueue.count() > 0:
            self.log.info('Generator waiting ...')
            if self.exit_event.wait(10):
                break

    def get_ready_tasks(self):
        # Pages already stored in the progress table are skipped on restart.
        query = '''SELECT * FROM pages'''
        for row in self.db.get_query(query):
            self.was_pages[row[0]] = True
        self.log.debug(f'total ready tasks: {len(self.was_pages)}')

    def run(self):
        for i in range(MAX_PAGES):
            if self.exit_event.is_set():
                break
            if self.was_pages.get(i):
                continue
            msg = {'url': PAGE_URL.format(num=i), 'num': i}
            self.log.debug(f'[{i}]: queue size is: {self.rqueue.count()}')
            while self.rqueue.count() > MAX_QUEUE_SIZE:
                self.log.info('Queue too big, wait')
                if self.exit_event.wait(5):
                    return
            self.rqueue.publish(msg)
        self.log.info('all tasks are generated')

    def init_progress_table(self):
        query = '''CREATE TABLE IF NOT EXISTS pages (page_num integer UNIQUE)'''
        self.db.exec_query(query)
class ShipsGenerator:
    def __init__(self, exit_event):
        self.exit_event = exit_event
        self.was_pages = {}
        self.db = DbPg(logger=None)
        self.rqueue = RabbitQueue(CRAWLER_EXCHANGE_NAME, CRAWLER_QUEUE_NAME)
        self.wait_queue()
        self.init_progress_table()
        self.get_ready_tasks()

    def wait_queue(self):
        while self.rqueue.count() > 0:
            if self.exit_event.wait(10):
                break

    def get_ready_tasks(self):
        query = '''SELECT * FROM pages'''
        for row in self.db.get_query(query):
            self.was_pages[row[0]] = True

    def run(self):
        for i in range(MAX_PAGES):
            if self.exit_event.is_set():
                break
            if self.was_pages.get(i):
                continue
            msg = {'url': PAGE_URL.format(num=i), 'num': i}
            while self.rqueue.count() > MAX_QUEUE_SIZE:
                if self.exit_event.wait(5):
                    return
            self.rqueue.publish(msg)

    def init_progress_table(self):
        query = '''CREATE TABLE IF NOT EXISTS pages (page_num integer UNIQUE)'''
        self.db.exec_query(query)
class CarGenerator:
    def __init__(self, exit_event):
        self.exit_event = exit_event
        self.log = init_logger('cars_url_generator')
        self.was_pages = {}
        self.db = DbPg(self.log)
        self.rqueue = RabbitQueue(CRAWLER_EXCHANGE_NAME, CRAWLER_QUEUE_NAME)
        self.wait_queue()
        self.init_progress_table()
        self.get_ready_tasks()

    def wait_queue(self):
        while self.rqueue.count() > 0:
            self.log.info('Generator waiting ...')
            if self.exit_event.wait(10):
                break

    def get_ready_tasks(self):
        # Progress is read via the ORM model instead of a raw query here.
        for row in Pages.get_pages():
            self.was_pages[row[0]] = True
        self.log.debug(f'total ready tasks: {len(self.was_pages)}')

    def run(self):
        for i in range(MAX_PAGES):
            if self.exit_event.is_set():
                break
            if self.was_pages.get(i):
                continue
            msg = {'url': PAGE_URL.format(num=i), 'num': i}
            self.log.debug(f'[{i}]: queue size is: {self.rqueue.count()}')
            while self.rqueue.count() > MAX_QUEUE_SIZE:
                self.log.info('Queue too big, wait')
                if self.exit_event.wait(5):
                    return
            self.rqueue.publish(msg)
        self.log.info('all tasks are generated')

    def init_progress_table(self):
        delete_Pages()
        init_db()
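# Usage sketch (not part of the original sources): the generators above poll
# exit_event inside wait_queue()/run(), so a threading.Event plus a signal
# handler is enough to stop them cleanly. The __main__ wrapper and the signal
# wiring below are assumptions for illustration only.
import signal
import threading

if __name__ == '__main__':
    exit_event = threading.Event()
    # SIGINT/SIGTERM set the event; the wait()/is_set() checks above pick it up.
    signal.signal(signal.SIGINT, lambda *_: exit_event.set())
    signal.signal(signal.SIGTERM, lambda *_: exit_event.set())
    CarGenerator(exit_event).run()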
def work(self, wnum):
    self.log.debug(f'{wnum} worker started')
    rab_connection = RabbitQueue(CRAWLER_EXCHANGE_NAME, CRAWLER_QUEUE_NAME)
    db_connection = DbPg(self.log)
    driver, prox = self.init_browser()
    for raw_msg in rab_connection.get_generator(self.exit_event):
        if not raw_msg:
            if self.exit_event.wait(2):
                break
            continue
        msg = raw_msg.json()
        if 'url' not in msg:
            self.log.warning(f'{wnum}: bad task: {msg}')
            raw_msg.ack()
            continue
        if msg['num'] == 0:
            msg['url'] = msg['url'].split('?')[0]
        try:
            driver.get(msg['url'])
            self.log.debug(driver.current_url)
            time.sleep(3)
            # parse with selenium
            rows = driver.find_elements_by_css_selector("tr")
            if not rows:
                self.log.debug(f'{wnum}: not rows in table')
                raw_msg.nack(requeue=True)
                break
            for row in rows:
                cells = row.find_elements_by_css_selector("td")
                if not cells:
                    continue
                data = {
                    'img_url': cells[0].find_element_by_css_selector(
                        'img').get_attribute('src'),
                    'country': cells[1].find_element_by_css_selector(
                        'span').get_attribute('title'),
                    'vessel_name': cells[1].text.split('\n')[0],
                    'vessel_type': cells[1].text.split('\n')[1],
                    'year': cells[2].text,
                    'gt': cells[3].text,
                    'dwt': cells[4].text,
                    'sz': cells[5].text,
                }
                vlength, vwidth = [int(v.strip()) for v in data['sz'].split('/')]
                self.log.debug(data)
                ship = Ship(
                    sid=None,
                    name=data['vessel_name'],
                    country_name=data['country'],
                    description=f'{data["vessel_type"]}, {data["img_url"]}',
                    built_year=data['year'],
                    length=vlength,
                    width=vwidth,
                    gt=data['gt'],
                    dwt=data['dwt'])
                db_connection.insert_ship(ship)
            db_connection.exec_query(f'''
                INSERT INTO pages (page_num)
                VALUES({msg['num']})
            ''')
            raw_msg.ack()
        except Exception as e0:
            self.log.error(f'{wnum}: get page error: {e0}')
            raw_msg.nack(requeue=True)
            if USE_PROXY:
                self.proxy_gen.back_proxy(prox, str(e0))
            driver.close()
            driver, prox = self.init_browser()
        time.sleep(random.randrange(1, 5))
    rab_connection.close()
    db_connection.close()
    self.log.info(f'{wnum}: worker exit')
def work(self, wnum):
    self.log.debug(f'{wnum} worker started')
    rab_connection = RabbitQueue(CRAWLER_EXCHANGE_NAME, CRAWLER_QUEUE_NAME)
    db_connection = DbPg(logger=None)
    for raw_msg in rab_connection.get_generator(self.exit_event):
        if not raw_msg:
            if self.exit_event.wait(2):
                break
            continue
        msg = raw_msg.json()
        if 'url' not in msg:
            self.log.warning(f'{wnum}: bad task: {msg}')
            raw_msg.ack()
            continue
        if msg['num'] == 0:
            msg['url'] = PAGE_URL0
        try:
            request = requests.get(msg['url'], headers=HEADERS).content
            soup = BeautifulSoup(request, 'html.parser')
            self.log.debug(msg['url'])
            time.sleep(1)

            names_list = []
            container_names = soup.select('div.information-container h2 a')
            for name in container_names:
                names_list.append(name.text)

            links = []
            container_links = soup.select('div.information-container h2 a')
            for i in container_links:
                ii = i['href'].split("&")[0]
                full_link = "https://www.autotrader.co.uk" + ii
                link = full_link.split('?')[0]
                links.append(link)

            photos = []
            container_photo = soup.select('figure.listing-main-image a img')
            for link_photo in container_photo:
                photos.append(link_photo['src'])

            list_price = []
            container_text = soup.find_all(
                "a",
                attrs={"class": "js-click-handler listing-fpa-link listings-price-link tracking-standard-link"})
            for i in container_text:
                pr = i.find_all("div", attrs={"class": "vehicle-price"})
                str_price = "".join(re.findall(r'[0-9]{,3},[0-9]{,3}', str(pr)))
                # hard-coded multiplier applied to the scraped price
                price = 27 * int(str_price.replace(',', ''))
                list_price.append(price)

            for n, l, f, p in zip(names_list, links, photos, list_price):
                db_session.add(Cars(n, l, f, p))
                db_session.commit()
                data = f'{n}\t{l}\t{f}\t{p}'
                self.log.debug(data)

            db_session.add(Pages(msg['num']))
            db_session.commit()
            raw_msg.ack()
        except Exception as e0:
            self.log.exception(f'{wnum}: get page error: {e0}')
            raw_msg.nack(requeue=True)
            prox = None  # no browser proxy in the requests-based variant
            if USE_PROXY:
                self.proxy_gen.back_proxy(prox, str(e0))
        time.sleep(random.randrange(1, 5))
    rab_connection.close()
    self.log.info(f'{wnum}: worker exit')
def work(self, wnum):
    rab_connection = RabbitQueue(CRAWLER_EXCHANGE_NAME, CRAWLER_QUEUE_NAME)
    db_connection = DbPg(logger=None)
    driver, prox = self.init_browser()
    for raw_msg in rab_connection.get_generator(self.exit_event):
        if not raw_msg:
            if self.exit_event.wait(2):
                break
            continue
        msg = raw_msg.json()
        if 'url' not in msg:
            raw_msg.ack()
            continue
        if msg['num'] == 0:
            msg['url'] = msg['url'].split('?')[0]
        try:
            driver.get(msg['url'])
            time.sleep(3)
            html = driver.page_source
            dom = lxml_html.fromstring(html)
            # parse the rendered page with lxml
            rows = dom.cssselect("tr")
            if not rows:
                raw_msg.nack(requeue=True)
                break
            for row in rows:
                cells = row.cssselect("td")
                if not cells:
                    continue
                data = {
                    'img_url': cells[0].cssselect('img')[0].get('src'),
                    'country': cells[1].cssselect('span')[0].get('title'),
                    'vessel_name': cells[1].cssselect('a')[0].text_content().strip(),
                    'vessel_type': cells[1].cssselect('small')[0].text_content().strip(),
                    'year': cells[2].text_content(),
                    'gt': cells[3].text_content(),
                    'dwt': cells[4].text_content(),
                    'sz': cells[5].text_content(),
                }
                vlength, vwidth = [int(v.strip()) for v in data['sz'].split('/')]
                ship = Ship(
                    sid=None,
                    name=data['vessel_name'],
                    country_name=data['country'],
                    description=f'{data["vessel_type"]}, {data["img_url"]}',
                    built_year=data['year'],
                    length=vlength,
                    width=vwidth,
                    gt=data['gt'],
                    dwt=data['dwt'],
                )
                db_connection.insert_ship(ship)
            db_connection.exec_query(f'''
                INSERT INTO pages (page_num)
                VALUES({msg['num']})
            ''')
            raw_msg.ack()
        except Exception as e0:
            raw_msg.nack(requeue=True)
            if USE_PROXY:
                self.proxy_gen.back_proxy(prox, str(e0))
            try:
                driver.close()
            except Exception:
                pass
            if not self.exit_event.is_set():
                driver, prox = self.init_browser()
        time.sleep(random.randrange(1, 5))
    try:
        rab_connection.close()
        db_connection.close()
        driver.close()
    except Exception:
        pass
def work(self, wnum):
    self.log.debug(f'{wnum} worker started')
    rab_connection = RabbitQueue(CRAWLER_EXCHANGE_NAME, CRAWLER_QUEUE_NAME)
    db_connection = DbPg(logger=None)
    # driver, prox = self.init_browser()
    for raw_msg in rab_connection.get_generator(self.exit_event):
        if not raw_msg:
            if self.exit_event.wait(2):
                break
            continue
        msg = raw_msg.json()
        if 'url' not in msg:
            self.log.warning(f'{wnum}: bad task: {msg}')
            raw_msg.ack()
            continue
        if msg['num'] == 0:
            msg['url'] = PAGE_URL0
            # msg['url'] = msg['url'].split('?')[0]
        try:
            # driver.get(msg['url'])
            request = requests.get(msg['url'], headers=HEADERS).content
            soup = BeautifulSoup(request, 'html.parser')
            # container = soup.select("li.search-page__result")
            self.log.debug(msg['url'])
            # self.log.debug(driver.current_url)
            time.sleep(1)

            names_list = []
            container_names = soup.select('div.information-container h2 a')
            for name in container_names:
                str_name = name.text
                # name = str_name.strip()
                names_list.append(str_name)

            links = []
            container_links = soup.select('div.information-container h2 a')
            for i in container_links:
                ii = i['href'].split("&")[0]
                # ii = i['href']
                full_link = "https://www.autotrader.co.uk" + ii
                link = full_link.split('?')[0]
                links.append(link)

            photos = []
            container_photo = soup.select('figure.listing-main-image a img')
            for link_photo in container_photo:
                photos.append(link_photo['src'])

            list_price = []
            container_text = soup.find_all(
                "a",
                attrs={"class": "js-click-handler listing-fpa-link listings-price-link tracking-standard-link"})
            for i in container_text:
                pr = i.find_all("div", attrs={"class": "vehicle-price"})
                str_price = "".join(re.findall(r'[0-9]{,3},[0-9]{,3}', str(pr)))
                price = 27 * int(str_price.replace(',', ''))
                list_price.append(price)

            for n, l, f, p in zip(names_list, links, photos, list_price):
                db_session.add(Cars(n, l, f, p))
                db_session.commit()
                data = f'{n}\t{l}\t{f}\t{p}'
                # (previous selenium-based <tr>/<td> parsing and Ship insertion
                #  removed here; see the selenium worker above)
                self.log.debug(data)

            # db_connection.insert_ship(car)
            # db_connection.exec_query(f'''
            #     INSERT INTO pages (page_num)
            #     VALUES({msg['num']})
            # ''')
            db_session.add(Pages(msg['num']))
            db_session.commit()
            raw_msg.ack()
        except Exception as e0:
            self.log.exception(f'{wnum}: get page error: {e0}')  # alternative: self.log.error
            raw_msg.nack(requeue=True)
            prox = None
            if USE_PROXY:
                self.proxy_gen.back_proxy(prox, str(e0))
            # driver.close()
            # driver, prox = self.init_browser()
        time.sleep(random.randrange(1, 5))
    rab_connection.close()
    # db_connection.close()
    self.log.info(f'{wnum}: worker exit')
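# Worker start-up sketch (not from the original code): each work(wnum) call
# opens its own RabbitQueue and DB connections, so several can run in parallel
# threads. WORKERS_COUNT and run_workers() are hypothetical names used only
# for this illustration.
import threading

WORKERS_COUNT = 4  # assumed setting, not defined in the sources above

def run_workers(crawler):
    threads = [
        threading.Thread(target=crawler.work, args=(wnum,), daemon=True)
        for wnum in range(WORKERS_COUNT)
    ]
    for t in threads:
        t.start()
    # Block until every worker sees the exit event or its queue generator ends.
    for t in threads:
        t.join()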