Example no. 1
class VesselCrawler:
    def __init__(self, ex_ev):
        self.log = init_logger(self.__class__.__name__)
        if USE_PROXY:
            self.proxy_gen = ProxyManager(self.log,
                                          ok_timeout=30,
                                          ban_timeout=1000)
        self.workers = []
        self.exit_event = ex_ev

    def run(self):
        for wnum in range(NUM_WORKERS):
            worker = threading.Thread(target=self.work,
                                      args=(wnum, ),
                                      daemon=True)
            self.workers.append(worker)

        for w in self.workers:
            w.start()

        while not self.exit_event.is_set():
            count_alive = sum(w.is_alive() for w in self.workers)

            self.log.debug(f'VesselCrawler is working: {count_alive} of '
                           f'{NUM_WORKERS} workers are alive')
            if self.exit_event.wait(30):
                break
        for w in self.workers:
            w.join()
        self.log.info('VesselCrawler exit run')

    def init_browser(self):
        prox = None
        driver = None
        while not self.exit_event.is_set():
            if USE_PROXY:
                prox = self.proxy_gen.next_proxy()

                self.log.debug(f'try proxy: {prox}')
                try:
                    status = prox.check_proxy()
                except Exception as e0:
                    self.log.debug(f'bad proxy: {prox}: {e0}')
                    self.proxy_gen.back_proxy(prox, str(e0))
                    continue
                self.log.debug(f'proxy: {prox}: {status}: OK')

            # setup chrome options
            # https://www.andressevilla.com/running-chromedriver-with-python-selenium-on-heroku/
            chrome_options = Options()
            # chrome_options.binary_location = "/path/to/chrome.exe"
            chrome_options.add_argument("--incognito")
            chrome_options.add_argument("--disable-dev-shm-usage")
            chrome_options.add_argument("--disable-gpu")
            chrome_options.add_argument("--no-sandbox")
            # chrome_options.add_argument("--user-data-dir=/path/to/profile")
            chrome_options.add_argument("--window-size=1920,1080")
            if IS_HEADLESS:
                chrome_options.add_argument("--headless")

            if USE_PROXY:
                proxy_str = '--proxy-server=https://{0}:{1}'.format(
                    prox.ip, str(prox.port))
                chrome_options.add_argument(proxy_str)

            driver = webdriver.Chrome(chrome_options=chrome_options,
                                      executable_path=DRIVER_PATH)
            driver.implicitly_wait(10)
            break

        return driver, prox

    def work(self, wnum):
        self.log.debug(f'{wnum} worker started')
        rab_connection = RabbitQueue(CRAWLER_EXCHANGE_NAME, CRAWLER_QUEUE_NAME)
        db_connection = DbPg(self.log)
        driver, prox = self.init_browser()

        for raw_msg in rab_connection.get_generator(self.exit_event):
            if not raw_msg:
                if self.exit_event.wait(2):
                    break
                continue

            msg = raw_msg.json()

            if 'url' not in msg:
                self.log.warning(f'{wnum}: bad task: {msg}')
                raw_msg.ack()
                continue

            if msg['num'] == 0:
                msg['url'] = msg['url'].split('?')[0]

            try:
                driver.get(msg['url'])

                self.log.debug(driver.current_url)
                time.sleep(3)

                # parse with selenium
                rows = driver.find_elements_by_css_selector("tr")
                if not rows:
                    self.log.debug(f'{wnum}: no rows in table')
                    raw_msg.nack(requeue=True)
                    break

                for row in rows:
                    cells = row.find_elements_by_css_selector("td")
                    if not cells:
                        continue

                    data = {
                        'img_url': cells[0].find_element_by_css_selector(
                            'img').get_attribute('src'),
                        'country': cells[1].find_element_by_css_selector(
                            'span').get_attribute('title'),
                        'vessel_name': cells[1].text.split('\n')[0],
                        'vessel_type': cells[1].text.split('\n')[1],
                        'year': cells[2].text,
                        'gt': cells[3].text,
                        'dwt': cells[4].text,
                        'sz': cells[5].text,
                    }
                    vlength, vwidth = [
                        int(v.strip()) for v in data['sz'].split('/')
                    ]
                    self.log.debug(data)
                    ship = Ship(
                        sid=None,
                        name=data['vessel_name'],
                        country_name=data['country'],
                        description=f'{data["vessel_type"]}, {data["img_url"]}',
                        built_year=data['year'],
                        length=vlength,
                        width=vwidth,
                        gt=data['gt'],
                        dwt=data['dwt'])
                    db_connection.insert_ship(ship)
                db_connection.exec_query(f'''
                    INSERT INTO pages (page_num)
                    VALUES({msg['num']})
                ''')
                raw_msg.ack()
            except Exception as e0:
                self.log.error(f'{wnum}: get page error: {e0}')
                raw_msg.nack(requeue=True)
                if USE_PROXY:
                    self.proxy_gen.back_proxy(prox, str(e0))
                try:
                    driver.quit()  # quit() ends the session; close() leaves chromedriver running
                except Exception:
                    pass
                driver, prox = self.init_browser()
            time.sleep(random.randrange(1, 5))

        rab_connection.close()
        db_connection.close()
        if driver:
            driver.quit()
        self.log.info(f'{wnum}: worker exit')
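
A minimal sketch of how this class is presumably driven; the signal wiring below is an assumption, not shown in the source (NUM_WORKERS, USE_PROXY and the other config names are imported by the crawler module):

import signal
import threading

# Hypothetical entry point: exit_event is the ex_ev parameter the crawler
# polls via is_set()/wait() for cooperative shutdown.
if __name__ == '__main__':
    exit_event = threading.Event()

    # Turn Ctrl+C into a shutdown request instead of killing workers
    # mid-request; run() then joins the worker threads and returns.
    signal.signal(signal.SIGINT, lambda signum, frame: exit_event.set())

    VesselCrawler(exit_event).run()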
Example no. 2
class VesselCrawlerLxml:
    def __init__(self, ex_ev):
        if USE_PROXY:
            self.proxy_gen = ProxyManager(
                log=None, ok_timeout=30, ban_timeout=1000)
        self.workers = []
        self.exit_event = ex_ev

    def run(self):
        for wnum in range(NUM_WORKERS):
            worker = threading.Thread(
                target=self.work, args=(wnum,), daemon=True
            )
            self.workers.append(worker)

        for w in self.workers:
            w.start()

        while not self.exit_event.is_set():
            if self.exit_event.wait(30):
                break
        for w in self.workers:
            w.join()


    def init_browser(self):
        prox = None
        driver = None
        while not self.exit_event.is_set():
            if USE_PROXY:
                prox = self.proxy_gen.next_proxy()
                try:
                    prox.check_proxy()
                except Exception as e0:
                    self.proxy_gen.back_proxy(prox, str(e0))
                    continue

            # setup chrome options
            # https://www.andressevilla.com/running-chromedriver-with-python-selenium-on-heroku/
            chrome_options = Options()
            # chrome_options.binary_location = "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome"
            chrome_options.add_argument("--incognito")
            chrome_options.add_argument("--disable-dev-shm-usage")
            chrome_options.add_argument("--disable-gpu")
            chrome_options.add_argument("--no-sandbox")
            # chrome_options.add_argument("--user-data-dir=/path/to/profile")
            chrome_options.add_argument("--window-size=1920,1080")
            if IS_HEADLESS:
                chrome_options.add_argument("--headless")

            if USE_PROXY:
                proxy_str = '--proxy-server=https://{0}:{1}'.format(
                    prox.ip, str(prox.port))
                chrome_options.add_argument(proxy_str)

            driver = webdriver.Chrome(chrome_options=chrome_options,
                                      executable_path=DRIVER_PATH)
            driver.implicitly_wait(10)
            break

        return driver, prox

    def work(self, wnum):
        rab_connection = RabbitQueue(CRAWLER_EXCHANGE_NAME, CRAWLER_QUEUE_NAME)
        db_connection = DbPg(logger=None)
        driver, prox = self.init_browser()

        for raw_msg in rab_connection.get_generator(self.exit_event):
            if not raw_msg:
                if self.exit_event.wait(2):
                    break
                continue

            msg = raw_msg.json()

            if 'url' not in msg:
                raw_msg.ack()
                continue

            if msg['num'] == 0:
                msg['url'] = msg['url'].split('?')[0]

            try:
                driver.get(msg['url'])
                time.sleep(3)

                html = driver.page_source
                dom = lxml_html.fromstring(html)

                # parse with lxml instead of per-element Selenium lookups
                rows = dom.cssselect("tr")
                if not rows:
                    raw_msg.nack(requeue=True)
                    break

                for row in rows:
                    cells = row.cssselect("td")
                    if not cells:
                        continue

                    data = {
                        'img_url': cells[0].cssselect('img')[0].get('src'),
                        'country': cells[1].cssselect('span')[0].get('title'),
                        'vessel_name': cells[1].cssselect('a')[0].text_content().strip(),
                        'vessel_type': cells[1].cssselect('small')[0].text_content().strip(),
                        'year': cells[2].text_content(),
                        'gt': cells[3].text_content(),
                        'dwt': cells[4].text_content(),
                        'sz': cells[5].text_content()
                    }
                    vlength, vwidth = [int(v.strip()) for v in data['sz'].split('/')]

                    ship = Ship(
                        sid=None,
                        name=data['vessel_name'],
                        country_name=data['country'],
                        description=f'{data["vessel_type"]}, {data["img_url"]}',
                        built_year=data['year'],
                        length=vlength,
                        width=vwidth,
                        gt=data['gt'],
                        dwt=data['dwt']
                    )
                    db_connection.insert_ship(ship)
                db_connection.exec_query(f'''
                    INSERT INTO pages (page_num)
                    VALUES({msg['num']})
                ''')
                raw_msg.ack()
            except Exception as e0:
                raw_msg.nack(requeue=True)
                if USE_PROXY:
                    self.proxy_gen.back_proxy(prox, str(e0))
                try:
                    driver.quit()  # quit() ends the session; close() leaves chromedriver running
                except Exception:
                    pass
                if not self.exit_event.is_set():
                    driver, prox = self.init_browser()
            time.sleep(random.randrange(1, 5))

        # close each resource independently so one failure does not leak the others
        for close_resource in (rab_connection.close, db_connection.close):
            try:
                close_resource()
            except Exception:
                pass
        if driver:
            try:
                driver.quit()
            except Exception:
                pass
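
This variant grabs driver.page_source once and parses it with lxml instead of making one Selenium round-trip per cell. A self-contained sketch of that extraction step, assuming the lxml and cssselect packages and a guessed table layout (the real markup is not shown in the source):

from lxml import html as lxml_html

SAMPLE = '''
<table><tr>
  <td><img src="/img/ship.jpg"></td>
  <td><span title="Panama"></span><a>EVER GIVEN</a><small>Container Ship</small></td>
  <td>2018</td><td>219079</td><td>199629</td><td>400 / 59</td>
</tr></table>
'''

dom = lxml_html.fromstring(SAMPLE)
for row in dom.cssselect('tr'):
    cells = row.cssselect('td')
    if not cells:
        continue
    # 'sz' is "length / width"; the crawler splits it the same way
    vlength, vwidth = (int(v.strip()) for v in cells[5].text_content().split('/'))
    print(cells[1].cssselect('a')[0].text_content(), vlength, vwidth)
    # -> EVER GIVEN 400 59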
Example no. 3
class CarCrawler:
    def __init__(self, ex_ev):
        self.log = init_logger(self.__class__.__name__)
        if USE_PROXY:
            self.proxy_gen = ProxyManager(self.log,
                                          ok_timeout=30,
                                          ban_timeout=1000)
        self.workers = []
        self.exit_event = ex_ev

    def run(self):
        for wnum in range(NUM_WORKERS):
            worker = threading.Thread(target=self.work,
                                      args=(wnum, ),
                                      daemon=True)
            self.workers.append(worker)

        for w in self.workers:
            w.start()

        while not self.exit_event.is_set():
            count_alive = sum(w.is_alive() for w in self.workers)

            self.log.debug(f'CarCrawler is working: {count_alive} of '
                           f'{NUM_WORKERS} workers are alive')
            if self.exit_event.wait(30):
                break
        for w in self.workers:
            w.join()
        self.log.info('CarCrawler exit run')

    def work(self, wnum):
        self.log.debug(f'{wnum} worker started')
        rab_connection = RabbitQueue(CRAWLER_EXCHANGE_NAME, CRAWLER_QUEUE_NAME)
        for raw_msg in rab_connection.get_generator(self.exit_event):
            if not raw_msg:
                if self.exit_event.wait(2):
                    break
                continue

            msg = raw_msg.json()
            self.log.debug(f'{wnum}: task: {msg}')

            if 'url' not in msg:
                self.log.warning(f'{wnum}: bad task: {msg}')
                raw_msg.ack()
                continue

            if msg['num'] == 0:
                msg['url'] = PAGE_URL0
                self.log.debug(f'{wnum}: page 0 rewritten to {PAGE_URL0}')

            try:
                # timeout keeps a dead host from hanging the worker forever
                resp = requests.get(msg['url'], headers=HEADERS, timeout=30)
                soup = BeautifulSoup(resp.content, 'html.parser')

                self.log.debug(msg['url'])
                time.sleep(1)

                anchors = soup.select('div.information-container h2 a')
                names_list = [a.text for a in anchors]

                links = []
                for a in anchors:
                    # strip tracking parameters from the listing URL
                    path = a['href'].split('&')[0]
                    full_link = 'https://www.autotrader.co.uk' + path
                    links.append(full_link.split('?')[0])

                photos = []
                container_photo = soup.select(
                    'figure.listing-main-image a img')
                for link_photo in container_photo:
                    photos.append(link_photo['src'])

                list_price = []
                container_text = soup.find_all(
                    "a",
                    attrs={
                        "class":
                        "js-click-handler listing-fpa-link listings-price-link tracking-standard-link"
                    })
                for i in container_text:
                    pr = i.find_all("div", attrs={"class": "vehicle-price"})
                    # matches comma-separated prices such as "12,495"; a price
                    # without a comma leaves str_price empty and raises below,
                    # requeueing the page (same effect as the original pattern)
                    str_price = ''.join(re.findall(r'\d{1,3}(?:,\d{3})+', str(pr)))
                    # 27 appears to be a hard-coded currency-conversion factor
                    price = 27 * int(str_price.replace(',', ''))
                    list_price.append(price)

                for n, l, f, p in zip(names_list, links, photos, list_price):

                    db_session.add(Cars(n, l, f, p))
                    db_session.commit()

                    self.log.debug(f'{n}\t{l}\t{f}\t{p}')

                db_session.add(Pages(msg['num']))
                db_session.commit()
                raw_msg.ack()
            except Exception as e0:
                self.log.exception(f'{wnum}: get page error: {e0}')
                raw_msg.nack(requeue=True)
                # no proxy is checked out in this crawler (requests.get runs
                # directly), so there is nothing to hand back to proxy_gen

            time.sleep(random.randrange(1, 5))

        rab_connection.close()
        self.log.info(f'{wnum}: worker exit')
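
The price parsing is the fragile part of this crawler, so here is a hypothetical, runnable illustration of the regex used above (the sample string mimics str(pr), the repr of a list of BeautifulSoup tags):

import re

sample = '[<div class="vehicle-price">£12,495</div>]'
str_price = ''.join(re.findall(r'\d{1,3}(?:,\d{3})+', sample))
print(27 * int(str_price.replace(',', '')))  # 337365, with 27 as the conversion factor

# A price without a thousands separator yields no match, so int('') would
# raise and the page would be nacked and requeued:
assert re.findall(r'\d{1,3}(?:,\d{3})+', '£950') == []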
Example no. 4
class CarCrawler:
    def __init__(self, ex_ev):
        self.log = init_logger(self.__class__.__name__)
        if USE_PROXY:
            self.proxy_gen = ProxyManager(
                self.log, ok_timeout=30, ban_timeout=1000)
        self.workers = []
        self.exit_event = ex_ev

    def run(self):
        for wnum in range(NUM_WORKERS):
            worker = threading.Thread(
                target=self.work, args=(wnum,), daemon=True
            )
            self.workers.append(worker)

        for w in self.workers:
            w.start()

        while not self.exit_event.is_set():
            count_alive = sum(w.is_alive() for w in self.workers)

            self.log.debug(f'CarCrawler is working: {count_alive} of '
                           f'{NUM_WORKERS} workers are alive')
            if self.exit_event.wait(30):
                break
        for w in self.workers:
            w.join()
        self.log.info('CarCrawler exit run')


    def work(self, wnum):
        self.log.debug(f'{wnum} worker started')
        rab_connection = RabbitQueue(CRAWLER_EXCHANGE_NAME, CRAWLER_QUEUE_NAME)
        for raw_msg in rab_connection.get_generator(self.exit_event):
            if not raw_msg:
                if self.exit_event.wait(2):
                    break
                continue

            msg = raw_msg.json()
            self.log.debug(f'{wnum}: task: {msg}')

            if 'url' not in msg:
                self.log.warning(f'{wnum}: bad task: {msg}')
                raw_msg.ack()
                continue

            if msg['num'] == 0:
                msg['url'] = PAGE_URL0
                self.log.debug(f'{wnum}: page 0 rewritten to {PAGE_URL0}')

            try:
                # timeout keeps a dead host from hanging the worker forever
                resp = requests.get(msg['url'], headers=HEADERS, timeout=30)
                soup = BeautifulSoup(resp.content, 'html.parser')

                self.log.debug(msg['url'])
                time.sleep(1)

                anchors = soup.select('div.information-container h2 a')
                names_list = [a.text for a in anchors]

                links = []
                for a in anchors:
                    # strip tracking parameters from the listing URL
                    path = a['href'].split('&')[0]
                    full_link = 'https://www.autotrader.co.uk' + path
                    links.append(full_link.split('?')[0])

                photos = []
                container_photo = soup.select('figure.listing-main-image a img')
                for link_photo in container_photo:
                    photos.append(link_photo['src'])

                list_price = []
                container_text = soup.find_all(
                    "a",
                    attrs={
                        "class":
                        "js-click-handler listing-fpa-link listings-price-link tracking-standard-link"
                    })
                for i in container_text:
                    pr = i.find_all("div", attrs={"class": "vehicle-price"})
                    # matches comma-separated prices such as "12,495"; a price
                    # without a comma leaves str_price empty and raises below,
                    # requeueing the page (same effect as the original pattern)
                    str_price = ''.join(re.findall(r'\d{1,3}(?:,\d{3})+', str(pr)))
                    # 27 appears to be a hard-coded currency-conversion factor
                    price = 27 * int(str_price.replace(',', ''))
                    list_price.append(price)

                for n, l, f, p in zip(names_list, links, photos, list_price):

                    db_session.add(Cars(n, l, f, p))
                    db_session.commit()

                    self.log.debug(f'{n}\t{l}\t{f}\t{p}')

                db_session.add(Pages(msg['num']))
                db_session.commit()
                raw_msg.ack()
            except Exception as e0:
                self.log.exception(f'{wnum}: get page error: {e0}')
                raw_msg.nack(requeue=True)
                # no proxy is checked out in this crawler (requests.get runs
                # directly), so there is nothing to hand back to proxy_gen
            time.sleep(random.randrange(1, 5))

        rab_connection.close()
        self.log.info(f'{wnum}: worker exit')
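
Cars, Pages and db_session come from a models module that is not shown. A guess at what it might look like; every name, column and the DSN here is an assumption, kept only to make the example above self-contained:

from sqlalchemy import Column, Integer, String, create_engine
from sqlalchemy.orm import declarative_base, scoped_session, sessionmaker

Base = declarative_base()

class Cars(Base):
    __tablename__ = 'cars'
    id = Column(Integer, primary_key=True)
    name = Column(String)
    link = Column(String)
    photo = Column(String)
    price = Column(Integer)

    def __init__(self, name, link, photo, price):
        self.name, self.link, self.photo, self.price = name, link, photo, price

class Pages(Base):
    __tablename__ = 'pages'
    page_num = Column(Integer, primary_key=True)

    def __init__(self, page_num):
        self.page_num = page_num

engine = create_engine('postgresql://user:pass@localhost/crawler')  # hypothetical DSN
Base.metadata.create_all(engine)

# scoped_session hands each worker thread its own session, which matters
# because work() runs in NUM_WORKERS threads concurrently.
db_session = scoped_session(sessionmaker(bind=engine))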