Example #1
def _get(self, url, sub_dir):
    capture = Capture(sub_dir, self.conf)
    browser = Browser(url, sub_dir, self.conf)
    links = []
    try:
        capture.run()
        sleep(3)
        html = browser.get()
        links = self._get_links(html)
        sleep(30)
    except (WebDriverException, KeyboardInterrupt):
        self._create_exception_file(traceback.format_exc(), sub_dir)
    finally:
        capture.kill()
        browser.close()
    return links
Example #2
@pytest.fixture(scope="class")
def setup(request, setUpClass):
    print("initiating chrome driver")
    driver = Browser().getbrowser("chrome")
    url = URL()
    driver.get(url.webViewerUAT)
    utility = Utility()
    # utility.createLogFolder()
    log = open(utility.logpath + "/WV-00.txt", "a+")
    driverUtility = DriverUtility(driver, log)
    loginPageObject = LoginPageObject(driverUtility, log)

    request.cls.driver = driver
    request.cls.url1 = url
    request.cls.utility = utility
    request.cls.driverUtility = driverUtility
    request.cls.loginPageObject = loginPageObject

    print("setup ended")
    yield driver
    driver.close()
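
The fixture above hands everything to the test class through request.cls, so tests only read attributes off self. Below is a minimal consumer sketch, assuming the fixture (and the setUpClass fixture it depends on) sits in a conftest.py that pytest can discover; the test class name and assertions are illustrative only:

import pytest


@pytest.mark.usefixtures("setup")
class TestWebViewerSmoke:
    def test_fixture_wiring(self):
        # Attributes were attached to the class by the fixture via request.cls.
        assert self.driver is not None
        assert self.loginPageObject is not None
        assert self.url1.webViewerUAT  # same URL the fixture navigated to
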
Example #3
class WV00(unittest.TestCase):

    # @pytest.fixture(scope="module")
    def setUpClass(self):
        self.utility = Utility()
        # CHANGE THE LOG FILE NAME IN THE NEXT LINE******************************************************************************************
        self.log = open(self.utility.logpath + "/WV-00.txt", "a+")
        self.suite_start_time = time.time()
        self.log.write("Suite started at {}\n".format(
            str(time.ctime(int(self.suite_start_time)))))
        self.url = URL()
        self.loginPageStaticTexts = LoginPageStaticText()
        self.loginPageTestData = LoginPageTestData()
        self.configTestCase = configparser.RawConfigParser()
        # CHANGE THE CONFIG PROPERTY FILE NAME IN THE NEXT LINE******************************************************************************************
        self.configTestCase.read(
            os.path.dirname(os.getcwd()) +
            '/TestCases/WV_00_Config.properties')
        self.configECG = configparser.RawConfigParser()
        self.configECG.read(
            os.path.dirname(os.getcwd()) +
            '/Scripts/ECGRelatedData.properties')
        self.configDevice = configparser.RawConfigParser()
        self.configDevice.read(
            os.path.dirname(os.getcwd()) +
            '/Scripts/DeviceRelatedData.properties')
        self.sendECG = SendECG()
        yield
        self.suite_end_time = time.time()
        self.total_time_taken_suite = self.suite_end_time - self.suite_start_time
        self.log.write("Suite ended at {}\n".format(
            str(time.ctime(int(self.suite_end_time)))))
        self.log.write(
            "Total time taken by Test Suite to finish: {} seconds\n".format(
                self.total_time_taken_suite))
        self.log.close()

    @pytest.fixture()
    def setUp(self):
        self.driver = Browser().getbrowser("chrome")
        self.driver.get(self.url.webViewerUAT)
        self.driverUtility = DriverUtility(self.driver, self.log)
        self.loginPageObject = LoginPageObject(self.driverUtility, self.log)
        yield
        self.driver.close()

    # WV-00-000
    # Description :
    # Procedure/Protocol :
    # - Generate a test ECG
    # Acceptance Criteria :

    def test_WV00(self, setUpClass, setUp):
        testCaseID = "WV-00-000"
        if self.configTestCase.get(testCaseID, 'Enabled') == "Yes":
            startTime = self.driverUtility.startTestCase(testCaseID)
            try:
                # code here for test cases without ECGs********************************************************************************
                self.driverUtility.passedTestCase(testCaseID, startTime)

            except AssertionError as error:
                self.driverUtility.failedTestCase(testCaseID)
            except Exception as e:
                self.driverUtility.erroredTestCase(testCaseID)
        else:
            self.log.write("Test case {} not enabled\n".format(testCaseID))
            self.log.write(
                "********************************************************************************\n"
            )
Example #4
class NhaDat247(CrawlerObject):

    BASE_URL = "https://nhadat247.com.vn/"
    SAVE_CHECK_POINT = 5

    def __init__(self,
                 date_from=None,
                 date_to=None,
                 post_type=None,
                 all_date: bool = False,
                 resume=False,
                 limit=-1):

        self.limit = int(limit)
        self.db_object = DBObject()
        the_status = "crawling"
        worker_info = self.db_object.query_wokers_info(Settings.worker_id)
        self.resume = resume
        if self.resume:
            try:
                info_ = worker_info
                status_ = info_["status"]
                task_id = info_["task_id"]
                info_str_ = info_["str_info"]
                if not ("(pause)" in status_ and "crawling" in status_):
                    print(">>", status_)
                    return
                info_dict_ = {
                    _i_.split(": ")[0]: _i_.split(": ")[1]
                    for _i_ in info_str_.lower().split(", ")
                }
                if info_dict_["site"] != "nhadat247.com.vn":
                    return
                date_from = info_dict_["date"].split("-")[0]
                date_to = info_dict_["date"].split("-")[1]

                try:
                    self.limit = int(info_dict_["limit"])
                except:
                    self.limit = -1

                post_type = info_dict_["type"]
                the_status = status_.replace("(pause)", "")
                print("Internal loading data to resume")
            except:
                traceback.print_exc()
                return

        self.__str_info = "Site: nhadat247.com.vn, Type: %s, Date: %s-%s, Limit: %s, " % (
            post_type, date_from, date_to, str(self.limit)
            if isinstance(self.limit, int) and self.limit > 0 else "No")
        self.__str_info += "Numpost: %d, Error: %d"

        self.post_type = post_type
        self.buffer = []
        self.seed_url = NhaDat247.get_seed_url(post_type)

        self.__current_url = ""
        self.__failed_urls = []
        self.__saved_post = []

        self.file_log_visited_url = "visited_post_log_nhadat247_%s.txt" % (
            self.post_type)
        self.file_log_new_url = "local_urls_log_nhadat247_%s.txt" % (
            self.post_type)

        self.regex_sub_url = re.compile(
            "([a-z][-a-z]*)?ban-[-a-z]+((.html)|(/[0-9]+))?")
        self.regex_post = re.compile(
            "([a-z][-a-z]*)?ban-[-a-z0-9]+/[-a-z0-9]+pr[0-9]+.html")

        self.key_type = NhaDat247.get_key_from_type(self.post_type)

        try:
            last_day_to = calendar.monthrange(int(date_to.split("/")[1]),
                                              int(date_to.split("/")[0]))[1]
            self.post_date_range = {
                "from":
                datetime.strptime("1/" + date_from, '%d/%m/%Y').date(),
                "to":
                datetime.strptime(
                    str(last_day_to) + "/" + date_to, '%d/%m/%Y').date()
            }
            print("-" * 200, "\n", self.post_date_range)
        except:
            traceback.print_exc()
            self.post_date_range = None

        self.browser = Browser(headless=False)

        if not self.resume:
            task_id = int(time.time())

        self.__crawling_info = {
            "task_id": task_id,
            "status": the_status,
            "str_info": ""
        }
        self.__crawling_log = {
            "worker_id": Settings.worker_id,
            "task_id": task_id,
            "task_info": self.__str_info % (0, 0),
            "saved_posts": [],
            "error_posts": []
        }

        if not self.resume:
            print("Create log")
            self.db_object.create_wokers_log(self.__crawling_log)
            self.update_crawling_status_info(0, 0)
        else:
            log = self.db_object.query_wokers_logs(Settings.worker_id, task_id)
            print("Get log: ", log if log else "null")
            if log is not None:
                self.__saved_post = log["saved_posts"]
                self.__failed_urls = log["error_posts"]

        print("Init crawler")

    def update_crawling_status_info(self, num_post, num_error):
        self.__crawling_info["str_info"] = self.__str_info % (num_post,
                                                              num_error)
        self.db_object.update_wokers_info(Settings.worker_id,
                                          self.__crawling_info)

    def update_crawling_log(self):
        self.db_object.update_wokers_log(Settings.worker_id,
                                         self.__crawling_log["task_id"],
                                         self.__saved_post, self.__failed_urls)

    def get_html_and_soup_from_url(self, url):
        """
        Return Beautifulsoup object
        """
        _soup = None
        _html = None
        for i in range(5):
            try:
                element_present = EC.presence_of_element_located(
                    (By.CSS_SELECTOR, "body > div.footer"))
                _html = self.browser.get_html(url, until_ec=element_present)
                _soup = BeautifulSoup(_html, 'html.parser')
                if _soup is not None:
                    return _html, _soup
            except Exception as e:
                traceback.print_exc()
                continue

        self.__failed_urls.append(self.__current_url)
        return None, None

    @staticmethod
    def get_key_from_type(key) -> list:
        if key == "land":
            return ["ban-dat"]
        elif key == "apartment":
            return ["ban-can-ho-chung-cu"]
        elif key == "house":
            return ["ban-nha-mat-pho", "ban-nha-biet-thu", "ban-nha-rieng"]

        return [
            "ban-dat", "ban-can-ho-chung-cu", "ban-nha-rieng",
            "ban-nha-mat-pho", "ban-nha-biet-thu"
        ]

    def check_type(self, url) -> bool:
        for key in self.key_type:
            if key in url:
                # print("ok")
                return True

        return False

    def append_data(self, _url, _type, _status, _crawl_date, _post_date,
                    _html):

        post = {}

        url_hash = hashlib.md5(_url.encode()).hexdigest()
        post["url_hash"] = url_hash
        post["url"] = _url
        post["type"] = _type
        post["status"] = _status
        post["html"] = _html
        post["date"] = _crawl_date
        post["post_date"] = _post_date
        self.__saved_post.append(url_hash)
        self.buffer.append(post)

        # post["html"] = "<html>"
        # print("-"*10,"\n",post)

    def load_init_url(self) -> tuple:
        local_urls = self.seed_url
        visited_post = []

        if self.resume:
            try:
                local_urls = list(open(self.file_log_new_url, "r").readlines())
            except:
                pass
            try:
                visited_post = list(
                    open(self.file_log_visited_url, "r").readlines())
            except:
                pass

        return local_urls, visited_post

    def get_date(self, page_soup: BeautifulSoup) -> date:
        post_date = None
        try:
            str_date = page_soup.select_one(
                "#ContentPlaceHolder1_ProductDetail1_divprice > div").get_text(
                ).split("|")[1]
            str_date = slugify(str_date.strip().lower())
            if "hom-kia" in str_date:
                post_date = date.today() - timedelta(days=2)
            elif "hom-qua" in str_date:
                post_date = date.today() - timedelta(days=1)
            elif "hom-nay" in str_date:
                post_date = date.today()
            else:

                post_date = datetime.strptime(str_date, '%d-%m-%Y').date()

        except Exception as e:
            self.__failed_urls.append(self.__current_url)
            traceback.print_exc()
        return post_date

    def visit(self, current_url) -> tuple:
        local_urls = []
        post_date = None
        page_source, page_soup = self.get_html_and_soup_from_url(current_url)

        if page_soup:

            is_post = re.search(self.regex_post, current_url)
            if is_post:
                print("Is a post")
                post_date = self.get_date(page_soup)
                if isinstance(post_date, date) and (
                        not self.post_date_range
                        or self.post_date_range["from"] <= post_date <= self.post_date_range["to"]):
                    post_date = post_date.strftime('%d/%m/%Y')
                else:
                    page_source = None

            else:
                page_source = None

            list_href = page_soup.find_all('a')

            for link in list_href:
                anchor = str(link.get('href'))
                if not bool(urlparse(anchor).netloc):
                    anchor = urljoin(self.BASE_URL, anchor)

                if validators.url(anchor) and self.check_type(anchor) and (
                        self.regex_post.search(anchor)
                        or self.regex_sub_url.search(anchor)):
                    local_urls.append(anchor)

        print("<html>" if page_source else "None")
        return page_source, post_date, local_urls

    def obtain_data(self):

        print("START...")
        num_visited = 0
        local_urls, visited_post = self.load_init_url()
        post_count = len(self.__saved_post)
        while local_urls:
            self.__current_url = local_urls.pop(0)

            if (len(self.__current_url) < 10
                    or self.__current_url in visited_post
                    or not self.check_type(self.__current_url)):
                continue

            print(" > ", self.__current_url)

            page_source, post_date, new_urls_to_visit = self.visit(
                self.__current_url)

            visited_post.append(self.__current_url)
            local_urls += new_urls_to_visit

            if page_source:
                post_count += 1
                self.append_data(_url=self.__current_url,
                                 _type="post",
                                 _status="0",
                                 _html=page_source,
                                 _crawl_date=str(
                                     date.today().strftime("%d/%m/%Y")),
                                 _post_date=post_date)

            # check-point to save buffer data
            if num_visited % self.SAVE_CHECK_POINT == 0:
                self.save_data()
                self.update_crawling_status_info(post_count,
                                                 len(self.__failed_urls))
                self.update_crawling_log()

                NhaDat247.save_list(local_urls, self.file_log_new_url)
                NhaDat247.save_list(visited_post, self.file_log_visited_url)

            num_visited += 1
            print("  >> num: ", post_count)
            if self.limit > 0 and post_count >= self.limit:
                break

        # finishing
        self.save_data()
        self.update_crawling_status_info(post_count, len(self.__failed_urls))
        self.update_crawling_log()
        self.browser.close()
        print('CRAWLING DONE')

    def rotate_ip(self, enable=False):
        self.browser.set_rotate_ip(enable)
        return

    def save_data(self):
        self.db_object.insert_html_data(self.buffer, many=True)
        # clear buffer
        self.buffer = []

    @staticmethod
    def get_seed_url(post_type):
        data = {
            "apartment": ["https://nhadat247.com.vn/ban-can-ho-chung-cu.html"],
            "house": [
                "https://nhadat247.com.vn/ban-nha-rieng.html",
                "https://nhadat247.com.vn/ban-nha-biet-thu-lien-ke.html",
                "https://nhadat247.com.vn/ban-nha-mat-pho.html"
            ],
            "land": [
                "https://nhadat247.com.vn/ban-dat-nen-du-an.html",
                "https://nhadat247.com.vn/ban-dat.html"
            ]
        }
        return data[post_type] if post_type in data else [
            url for e in data for url in data[e]
        ]

    @staticmethod
    def save_list(data: list, file_name):
        print("Checkpoint: ", file_name)
        with open(file_name, 'w') as file:
            file.write("\n".join(set(data)))
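
For context, here is a hypothetical driver for this crawler, pieced together from the constructor signature and obtain_data(); the month/year strings follow the m/yyyy format the date parsing above expects, and the concrete values (plus a configured DBObject and Settings.worker_id) are assumptions:

if __name__ == "__main__":
    crawler = NhaDat247(date_from="1/2021",   # January 2021, m/yyyy
                        date_to="3/2021",     # March 2021
                        post_type="house",
                        resume=False,
                        limit=100)
    crawler.rotate_ip(False)   # keep a single IP; pass True to rotate
    crawler.obtain_data()      # crawl, checkpoint periodically, store posts via DBObject
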
Example #5
@pytest.fixture(scope="class")
def setup(request, setUpClass):
    print("initiating chrome driver")
    driver = Browser().getbrowser("chrome")
    url = URL()
    driver.get(url.webViewerUAT)
    utility = Utility()
    # utility.createLogFolder()
    log = open(utility.logpath + "/WV-00.txt", "a+")
    driverUtility = DriverUtility(driver, log)
    loginPageObject = LoginPageObject(driverUtility, log)

    request.cls.driver = driver
    request.cls.url1 = url
    request.cls.utility = utility
    request.cls.driverUtility = driverUtility
    request.cls.loginPageObject = loginPageObject

    print("setup ended")
    yield driver
    driver.close()


# from datetime import datetime

# def pytest_logger_config(logger_config):

#     logger_config.add_loggers(['foo', 'bar', 'baz'], stdout_level='debug')
#     logger_config.set_log_option_default('foo,bar')

# def pytest_logger_logdirlink(config):
# 	print("1")
# 	path = os.path.dirname(os.getcwd()) + '/Logs/'
# 	foldername = datetime.now().strftime("%Y%m%d-%H%M%S")
# 	logpath = path+foldername
# 	try:
# 		# return os.mkdir(logpath)
# 		return os.path.join(path, foldername)
# 		# return logpath
# 	except OSError as e:
# 		print("Creation of the directory failed")
# 		print(traceback.format_exc())
# 	else:
# 		print("Successfully created the directory")

# return os.path.join(os.path.dirname(__file__), 'mylogs')

# @pytest.yield_fixture(scope='session')
# def session_thing():
#     foo.debug('constructing session thing')
#     yield
#     foo.debug('destroying session thing')

# @pytest.yield_fixture
# def testcase_thing():
#     foo.debug('constructing testcase thing')
#     yield
#     foo.debug('destroying testcase thing')

# @pytest.fixture(scope="class")
# def setup(request):
#     print("initiating chrome driver")
#     driver = Browser().getbrowser("chrome") #if not added in PATH
#     url = URL()
#     utility = Utility()

#     # driver.maximize_window()
#     request.cls.d = driver
#     request.cls.u = utility
#     request.cls.url1 = url
#     yield
#     driver.close()

# import pytest
# from selenium import webdriver

# @pytest.fixture(scope="class")
# def setup(request):
#     print("initiating chrome driver")
#     driver = Browser().getbrowser("chrome") #if not added in PATH
#     url = URL()
#     utility = Utility()
#     # driver.maximize_window()
#     request.cls.d = driver
#     request.cls.u = utility
#     request.cls.url1 = url

#     yield driver
#     driver.close()

# @pytest.fixture(scope='session')
# def config():
# 	with open('WV_00_Config.json') as config_file:
# 		data = json.load(config_file)
# 		for r in data['Enabled']:
# 			print (r[b])
# 	return data
Example #6
class ChoTotCrawler(CrawlerObject):

    BASE_URL = "https://nha.chotot.com/"
    SAVE_CHECK_POINT = 5

    def __init__(self,
                 date_from=None,
                 date_to=None,
                 post_type=None,
                 all_date: bool = False,
                 resume=False,
                 limit=-1):

        self.limit = int(limit)
        self.db_object = DBObject()
        the_status = "crawling"
        worker_info = self.db_object.query_wokers_info(Settings.worker_id)
        self.resume = resume
        if self.resume:
            try:
                info_ = worker_info
                status_ = info_["status"]
                task_id = info_["task_id"]
                info_str_ = info_["str_info"]
                if not ("(pause)" in status_ and "crawling" in status_):
                    print(">>", status_)
                    return
                info_dict_ = {
                    _i_.split(": ")[0]: _i_.split(": ")[1]
                    for _i_ in info_str_.lower().split(", ")
                }
                if info_dict_["site"] != "nha.chotot.com":
                    return
                date_from = info_dict_["date"].split("-")[0]
                date_to = info_dict_["date"].split("-")[1]

                try:
                    self.limit = int(info_dict_["limit"])
                except:
                    self.limit = -1

                post_type = info_dict_["type"]
                the_status = status_.replace("(pause)", "")
                print("Internal loading data to resume")
            except:
                traceback.print_exc()
                return

        self.__str_info = "Site: nha.chotot.com, Type: %s, Date: %s-%s, Limit: %s, " % (
            post_type, date_from, date_to, str(self.limit)
            if isinstance(self.limit, int) and self.limit > 0 else "No")
        self.__str_info += "Numpost: %d, Error: %d"

        self.post_type = post_type
        self.buffer = []
        self.seed_url = ChoTotCrawler.get_seed_url(post_type)

        self.__current_url = ""
        self.__failed_urls = []
        self.__saved_post = []

        self.file_log_visited_url = "visited_post_log_chotot_%s.txt" % (
            self.post_type)
        self.file_log_new_url = "local_urls_log_chotot_%s.txt" % (
            self.post_type)

        self.regex_sub_url = re.compile(
            "([a-z][-a-z]*)?ban-[-a-z]+((.htm)|(/[0-9]+))?")
        self.regex_post = re.compile(
            "([a-z][-a-z]+)?[/][a-z][-a-z0-9]+/[-a-z0-9]+.htm")

        self.key_type = ChoTotCrawler.get_key_from_type(self.post_type)

        try:
            last_day_to = calendar.monthrange(int(date_to.split("/")[1]),
                                              int(date_to.split("/")[0]))[1]
            self.post_date_range = {
                "from":
                datetime.strptime("1/" + date_from, '%d/%m/%Y').date(),
                "to":
                datetime.strptime(
                    str(last_day_to) + "/" + date_to, '%d/%m/%Y').date()
            }
            print("-" * 200, "\n", self.post_date_range)
        except:
            traceback.print_exc()
            self.post_date_range = None

        self.browser = Browser(headless=False)

        if not self.resume:
            task_id = int(time.time())

        self.__crawling_info = {
            "task_id": task_id,
            "status": the_status,
            "str_info": ""
        }
        self.__crawling_log = {
            "worker_id": Settings.worker_id,
            "task_id": task_id,
            "task_info": self.__str_info % (0, 0),
            "saved_posts": [],
            "error_posts": []
        }

        if not self.resume:
            print("Create log")
            self.db_object.create_wokers_log(self.__crawling_log)
            self.update_crawling_status_info(0, 0)
        else:
            log = self.db_object.query_wokers_logs(Settings.worker_id, task_id)
            print("Get log: ", log if log else "null")
            if log is not None:
                self.__saved_post = log["saved_posts"]
                self.__failed_urls = log["error_posts"]

        print("Init crawler")

    def update_crawling_status_info(self, num_post, num_error):
        self.__crawling_info["str_info"] = self.__str_info % (num_post,
                                                              num_error)
        self.db_object.update_wokers_info(Settings.worker_id,
                                          self.__crawling_info)

    def update_crawling_log(self):
        self.db_object.update_wokers_log(Settings.worker_id,
                                         self.__crawling_log["task_id"],
                                         self.__saved_post, self.__failed_urls)

    def get_html_and_soup_from_url(self, url):
        """
        Return Beautifulsoup object
        """
        _soup = None
        _html = None
        click_phone_script = """
            function getElementByXpath(path) {
                return document.evaluate(path, document, null, XPathResult.FIRST_ORDERED_NODE_TYPE, null).singleNodeValue;
            }

            var phone = getElementByXpath("//*[@id='__next']/div[3]/div[1]/div/div[4]/div[3]/div/linkcontact");
            if (phone != null) {
                phone.click();
            }                    
        """

        for i in range(5):
            try:
                is_post = re.search(self.regex_post, url)
                element_present = EC.presence_of_element_located(
                    (By.XPATH, """//html/body/div[1]/footer"""))
                _html = self.browser.get_html(
                    url=url,
                    until_ec=element_present,
                    run_script=click_phone_script if is_post else None)
                _soup = BeautifulSoup(_html, 'html.parser')
                if _soup is not None:
                    return _html, _soup
            except Exception as e:
                traceback.print_exc()
                continue

        self.__failed_urls.append(self.__current_url)
        return None, None

    @staticmethod
    def get_key_from_type(key) -> list:
        if key == "land":
            return ["mua-ban-dat"]
        elif key == "apartment":
            return ["mua-ban-can-ho-chung-cu"]
        elif key == "house":
            return ["mua-ban-nha-dat"]

        return ["mua-ban-dat", "mua-ban-nha-dat", "mua-ban-can-ho-chung-cu"]

    def check_type(self, url) -> bool:
        for key in self.key_type:
            if key in url:
                # print("ok")
                return True

        return False

    def append_data(self, _url, _type, _status, _crawl_date, _post_date,
                    _html):

        post = {}

        url_hash = hashlib.md5(_url.encode()).hexdigest()
        post["url_hash"] = url_hash
        post["url"] = _url
        post["type"] = _type
        post["status"] = _status
        post["html"] = _html
        post["date"] = _crawl_date
        post["post_date"] = _post_date
        self.__saved_post.append(url_hash)
        self.buffer.append(post)

        # post["html"] = "<html>"
        # print("-"*10,"\n",post)

    def load_init_url(self) -> tuple:
        local_urls = self.seed_url
        visited_post = []

        if self.resume:
            try:
                local_urls = list(open(self.file_log_new_url, "r").readlines())
            except:
                pass
            try:
                visited_post = list(
                    open(self.file_log_visited_url, "r").readlines())
            except:
                pass

        return local_urls, visited_post

    @staticmethod
    def convert_str2date(date_str):
        _date = None

        date_str = slugify(date_str.lower())
        _l = date_str.split("-")
        if "hom-qua" in date_str:
            _date = date.today() - timedelta(days=1)
        elif "thang" in _l:
            _n = int(_l[_l.index("thang") - 1][0])
            _date = date.today() - timedelta(days=30 * _n)
        elif "tuan" in _l:
            _n = int(_l[_l.index("tuan") - 1][0])
            _date = date.today() - timedelta(days=7 * _n)
        elif "ngay" in _l:
            _n = int(_l[_l.index("ngay") - 1][0])
            _date = date.today() - timedelta(days=_n)
        elif "hom-nay" in date_str or "gio" in _l or "phut" in _l:
            _date = date.today()
        else:
            _date = datetime.strptime(date_str, '%d-%m-%Y').date()

        return _date

    def get_date(self, page_soup: BeautifulSoup) -> date:
        post_date = None
        try:
            str_date = page_soup.select_one(
                "#__next > div > div.ct-detail.adview > div > div.col-md-8 > div.adImageWrapper___KTd-h > div.imageCaption___cMU2J > span"
            ).get_text()
            str_date = str_date.strip()
            post_date = ChoTotCrawler.convert_str2date(str_date)

        except Exception as e:
            self.__failed_urls.append(self.__current_url)
            traceback.print_exc()
        return post_date

    def visit(self, current_url) -> tuple:
        local_urls = []
        post_date = None
        page_source, page_soup = self.get_html_and_soup_from_url(current_url)

        if page_soup:

            is_post = re.search(self.regex_post, current_url)
            if is_post:
                print("Is a post")
                post_date = self.get_date(page_soup)
                if isinstance(post_date, date) and (
                        not self.post_date_range
                        or self.post_date_range["from"] <= post_date <= self.post_date_range["to"]):
                    post_date = post_date.strftime('%d/%m/%Y')
                else:
                    page_source = None

            else:
                page_source = None

            list_href = page_soup.find_all('a')

            for link in list_href:
                anchor = str(link.get('href'))
                if not bool(urlparse(anchor).netloc):
                    anchor = urljoin(self.BASE_URL, anchor)

                if validators.url(anchor) and self.check_type(anchor) and (
                        self.regex_post.search(anchor)
                        or self.regex_sub_url.search(anchor)):
                    local_urls.append(anchor)

        print("<html>" if page_source else "None")
        return page_source, post_date, local_urls

    def obtain_data(self):

        print("START...")
        num_visited = 0
        local_urls, visited_post = self.load_init_url()
        post_count = len(self.__saved_post)
        while local_urls:
            self.__current_url = local_urls.pop(0)

            if (len(self.__current_url) < 10
                    or self.__current_url in visited_post
                    or not self.check_type(self.__current_url)):
                continue

            print(" > ", self.__current_url)

            page_source, post_date, new_urls_to_visit = self.visit(
                self.__current_url)

            visited_post.append(self.__current_url)
            local_urls += new_urls_to_visit

            if page_source:
                post_count += 1
                self.append_data(_url=self.__current_url,
                                 _type="post",
                                 _status="0",
                                 _html=page_source,
                                 _crawl_date=str(
                                     date.today().strftime("%d/%m/%Y")),
                                 _post_date=post_date)

            # check-point to save buffer data
            if num_visited % self.SAVE_CHECK_POINT == 0:
                self.save_data()
                self.update_crawling_status_info(post_count,
                                                 len(self.__failed_urls))
                self.update_crawling_log()

                ChoTotCrawler.save_list(local_urls, self.file_log_new_url)
                ChoTotCrawler.save_list(visited_post,
                                        self.file_log_visited_url)

            num_visited += 1
            print("  >> num: ", post_count)
            if self.limit > 0 and post_count >= self.limit:
                break

        # finishing
        self.save_data()
        self.update_crawling_status_info(post_count, len(self.__failed_urls))
        self.update_crawling_log()
        self.browser.close()
        print('CRAWLING DONE')

    def rotate_ip(self, enable=False):
        self.browser.set_rotate_ip(enable)
        return

    def save_data(self):
        self.db_object.insert_html_data(self.buffer, many=True)
        # clear buffer
        self.buffer = []

    @staticmethod
    def get_seed_url(post_type):
        data = {
            "apartment":
            ["https://nha.chotot.com/toan-quoc/mua-ban-can-ho-chung-cu"],
            "house": ["https://nha.chotot.com/toan-quoc/mua-ban-nha-dat"],
            "land": ["https://nha.chotot.com/toan-quoc/mua-ban-dat"]
        }
        return data[post_type] if post_type in data else [
            url for e in data for url in data[e]
        ]

    @staticmethod
    def save_list(data: list, file_name):
        print("Checkpoint: ", file_name)
        with open(file_name, 'w') as file:
            file.write("\n".join(set(data)))
Example #7
    if center:
        # Exactly one center detected: extend the movement sequence.
        if len(center) == 1:
            s = 0
            c = 0
            seq.append(center[0])
            # Every 5 samples, classify the movement and forward it to the browser.
            if len(seq) == 5:
                movimento = open_cv_wrapper.check_movement(seq, frame)
                print(movimento)
                browser.movement_detection(movimento)
                seq.pop(0)
        else:
            # More than one center: after 5 such frames, close the browser if it is active.
            s += 1
            if s >= 5:
                is_browser_active = browser.is_active()
                if is_browser_active is not None and is_browser_active:
                    browser.close()
                    s = 0
                    continue
    else:
        # No center detected: reset the sequence after 3 frames without a detection.
        c += 1
        if c == 3:
            seq = []
            s = 0

    # Update the FPS estimate once per second.
    t2 = time.time()
    if t2 - t1 >= 1:
        fps = f
        f = 0
        t1 = time.time()
    else:
        f += 1
Example #8
class Upload:
    def __init__(self, user):
        self.bot = None
        self.lang = "en"
        self.url = f"https://www.tiktok.com/upload?lang={self.lang}"
        self.cookies = None
        self.userRequest = {"dir": "", "cap": "", "vidTxt": ""}
        self.video = None
        self.IO = IO("hashtags.txt", "schedule.csv")
        self.videoFormats = ["mov", "flv", "avi"]
        self.userPreference = user

    # Uploads a video to TikTok with the given settings.
    def uploadVideo(self,
                    video_dir,
                    videoText,
                    startTime=0,
                    endTime=0,
                    private=True,
                    test=False,
                    scheduled=False,
                    schdate="",
                    schtime=""):

        video_dir = self.downloadIfYoutubeURL(video_dir)
        if not video_dir:
            return

        # Initiate the bot if it isn't already.
        if self.bot is None:
            self.bot = Browser().getBot()
            self.webbot = Bot(self.bot)

        self.userRequest["dir"] = video_dir
        self.checkFileExtensionValid()
        self.userRequest["cap"] = self.IO.getHashTagsFromFile()
        # Open the upload page.
        self.bot.get(self.url)
        self.userRequest["vidTxt"] = videoText

        # Cookies loaded here.
        self.cookies = Cookies(self.bot)
        self.bot.refresh()

        # User now has logged on and can upload videos
        time.sleep(3)
        self.inputVideo(startTime, endTime)
        self.addCaptions()
        utils.randomTimeQuery()
        if private:
            self.webbot.selectPrivateRadio()  # private video selection
        else:
            self.webbot.selectPublicRadio()  # public video selection
        utils.randomTimeQuery()
        if not test:
            self.webbot.uploadButtonClick()  # upload button
        input("Press any button to exit")

    def createVideo(self, video_dir, videoText, startTime=0, endTime=0):
        video_dir = self.downloadIfYoutubeURL(video_dir)
        if not video_dir:
            return
        self.inputVideo(startTime, endTime)
        self.addCaptions()
        print(f"Video has been created: {self.video.dir}")

    # Method to check the file has a valid .mp4 extension.
    def checkFileExtensionValid(self):
        if not self.userRequest["dir"].endswith('.mp4'):
            self.bot.close()
            exit(f"File: {self.userRequest['dir']} has wrong file extension.")

    # This gets the hashtags from file and adds them to the website input
    def addCaptions(self, hashtag_file=None):
        if not hashtag_file:
            caption_elem = self.webbot.getCaptionElem()
            for hashtag in self.IO.getHashTagsFromFile():
                caption_elem.send_keys(hashtag)

    def clearCaptions(self):
        caption_elem = self.webbot.getCaptionElem()
        caption_elem.send_keys("")

    def inputScheduler(self, schdate, schtime):
        # In charge of selecting scheduler in the input.
        utils.randomTimeQuery()
        self.webbot.selectScheduleToggle()

    # This is in charge of adding the video into tiktok input element.
    def inputVideo(self, startTime=0, endTime=0):
        try:
            file_input_element = self.webbot.getVideoUploadInput()
        except Exception as e:
            print(
                "Major error, cannot find the upload button, please update getVideoUploadInput() in Bot.py"
            )
            print(f"Actual Error: {e}")
            file_input_element = ""
            exit()
        # Check if file has correct .mp4 extension, else throw error.
        self.video = Video(self.userRequest["dir"], self.userRequest["vidTxt"],
                           self.userPreference)
        print(f"startTime: {startTime}, endTime: {endTime}")
        if endTime != 0:
            print(f"Cropping Video timestamps: {startTime}, {endTime}")
            self.video.customCrop(startTime, endTime)
        # Crop first and then make video.

        self.video.createVideo()  # Link to video class method
        while not os.path.exists(self.video.dir):  # Wait for path to exist
            time.sleep(1)
        abs_path = os.path.join(os.getcwd(), self.video.dir)
        file_input_element.send_keys(abs_path)

    def downloadIfYoutubeURL(self, video_dir) -> str:
        """
        Determine whether the given video directory is a YouTube link; if so, return the
        downloaded video path, otherwise return the given path unchanged.
        """

        url_variants = [
            "http://youtu.be/", "https://youtu.be/", "http://youtube.com/",
            "https://youtube.com/", "https://m.youtube.com/",
            "http://www.youtube.com/", "https://www.youtube.com/"
        ]
        if any(ext in video_dir for ext in url_variants):
            print("Detected Youtube Video...")
            video_dir = Video.get_youtube_video(self.userPreference, video_dir)
        return video_dir

    def directUpload(self, filename, private=False, test=False):
        if self.bot is None:
            self.bot = Browser().getBot()
            self.webbot = Bot(self.bot)
        self.bot.get(self.url)
        utils.randomTimeQuery()
        self.cookies = Cookies(self.bot)
        self.bot.refresh()

        try:
            file_input_element = self.webbot.getVideoUploadInput()
        except Exception as e:
            print(f"Error: {e}")
            print(
                "Major error, cannot find the file upload button, please update getVideoUploadInput() in Bot.py"
            )
            file_input_element = None
            exit()
        abs_path = os.path.join(os.getcwd(), filename)
        try:
            file_input_element.send_keys(abs_path)
        except StaleElementReferenceException as e:
            try:
                self.bot.implicitly_wait(5)
                file_input_element = self.webbot.getVideoUploadInput()
                file_input_element.send_keys(abs_path)
            except Exception as e:
                print(
                    "Major error, cannot find the file upload button, please update getVideoUploadInput() in Bot.py"
                )
                exit()

        # We need to wait until it is uploaded and then clear input.

        self.addCaptions()
        utils.randomTimeQuery()
        if private:
            self.webbot.selectPrivateRadio()  # private video selection
            utils.randomTimeQuery()
        else:
            """
            self.webbot.selectPublicRadio()  # public video selection
            utils.randomTimeQuery()
            """
            pass
        if not test:
            self.webbot.uploadButtonClick()  # upload button
        input("Press any button to exit")
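
A minimal driver for this uploader might look like the sketch below; the preferences object and file name are placeholders, and test=True stops short of clicking the final upload button, mirroring the test flag in directUpload():

if __name__ == "__main__":
    user_preferences = {}  # hypothetical; the real shape depends on Video and IO
    uploader = Upload(user_preferences)
    uploader.directUpload("clip.mp4", private=True, test=True)
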
Example #9
class Reviews:
    def __init__(self, path=None, lang="ar", edition_reviews=False):
        # Language of reviews to be scraped
        self._lang = lang
        # Instantiate browsing and writing managers
        self.wr = Writer(path) if path else Writer()
        self.br = Browser(edition_reviews)
        # Initialize an empty threads list
        self._threads = []
        # Counter for reviews from different languages
        self._invalid = None

    def start(self):
        self.br.start()

    # Scrape and write books' reviews to separate files
    def output_books_reviews(self, books_ids, consider_previous=True):
        if consider_previous:
            # Don't loop through already scraped books
            self.wr.consider_written_files(books_ids)
        # Show how many books are going to be scraped
        print(f"Scraping {len(books_ids)} Books")
        # Loop through book ids in array and scrape books
        for book_id in books_ids:
            self.output_book_reviews(book_id)

    # Scrape and write one book's reviews to a file
    def output_book_reviews(self, book_id):
        self._threads.clear()
        # Open book file and page by its Id
        self.br.open_book_page(book_id)
        self.wr.open_book_file(book_id)
        # Reset the invalid reviews counter
        self._invalid = 0
        # Scrape book meta data in first line
        self.run(self._scrape_book_meta, [book_id])
        # Scrape first page of the book anyway
        self.run(self._scrape_book_reviews)
        no_next_page = False
        try:  # Scrape the remaining pages
            while self._invalid < 60:
                # Go to next page if there's one
                in_next_page = self.br.goto_next_page()
                if no_next_page or not in_next_page:
                    no_next_page = False
                    # Switch to a different reviews mode
                    if not self.br.switch_reviews_mode(book_id, in_next_page is None):
                        # Break after switching to all modes
                        break
                # Wait until requested book reviews are loaded
                if self.br.are_reviews_loaded():
                    # Scrape loaded book reviews
                    self.run(self._scrape_book_reviews)
                else: no_next_page = True
        finally:
            # Wait until all threads are done
            for thread in self._threads:
                thread.join()
        # Finalize file name and close it
        self.wr.close_book_file()

    # Scrape and write book and author data
    def _scrape_book_meta(self, html, book_id):
        # Create soup object and store book meta section of the page in soup
        soup = BeautifulSoup(html, "lxml").find(id="metacol")
        # If book is not found
        if not soup:
            print(f"*Book ID:\t{book_id:<15}Not Found!")
            # Close file and raise an error
            self.wr.close_book_file()
            raise FileNotFoundError
        # Get book title and remove spaces from it
        title = soup.find(id="bookTitle").get_text(". ", strip=True)
        # Get average rating of the book out of five
        rating = soup.find(class_="average").get_text()
        # Store author data section
        author = soup.find(class_="authorName")
        # Get author id from url
        id_ = author.get("href")[38:].split(".")[0]
        # Get author name
        name = author.find().get_text()
        # Write scraped meta data to file's first line
        self.wr.write_book_meta(book_id, title, rating, id_, name)
        # Display book id and title
        print(f"*Book ID:\t{book_id:<15}Title:\t{title}")

    # Scrape a single page's reviews
    def _scrape_book_reviews(self, html):
        # Store reviews section of the page in soup
        soup = BeautifulSoup(html, "lxml").find(id="bookReviews")
        # Loop through reviews individually
        for review in soup.find_all(class_="review"):
            try:  # Get user / reviewer id
                user_id = review.find(class_="user").get("href")[11:].split("-")[0]
                # Get rating out of five stars
                stars = len(review.find(class_="staticStars").find_all(class_="p10"))
                # Get full review text even the hidden parts, and remove spaces and newlines
                comment = review.find(class_="readable").find_all("span")[-1].get_text(". ", strip=True)
                # Detect which language the review is in
                if detect(comment) != self._lang:
                    # Count it as a different language review
                    self._invalid += 1
                    continue
                # Get review date
                date = review.find(class_="reviewDate").get_text()
            # Skip the rest if one of the above is missing
            except Exception:
                # Count it as an invalid review
                self._invalid += 2
                continue
            # If it's not a strike, reset the counter
            self._invalid = 0
            # Get review ID
            review_id = review.get("id")[7:]
            # Write the scraped review to the file
            self.wr.write_review(review_id, user_id, date, stars, comment)
            # Add review id to ids
            print(f"Added ID:\t{review_id}")
        return True

    # Starts a scraping process on a new thread
    def run(self, method, args=None):
        # Create a thread and add it to threads list then start it
        self._threads.append(
            SafeThread(target=method, args=[self.br.page_source] + (args or [])))
        self._threads[-1].start()

    def reset(self):
        self.stop()
        self.start()
        print("Restarted Reviews")

    def stop(self):
        self.br.close()
        self.wr.delete_file()

    def close(self):
        self.br.quit()
        self.wr.close()
        self._threads.clear()
        print("Closed Reviews")
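
A short usage sketch for the scraper above; the output path and book IDs are placeholders, and Writer, Browser and SafeThread are assumed to come from the same project:

if __name__ == "__main__":
    reviews = Reviews(path="reviews_output", lang="ar")
    reviews.start()
    try:
        # Placeholder Goodreads book IDs.
        reviews.output_books_reviews(["1", "23", "456"], consider_previous=True)
    finally:
        reviews.close()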