def _get(self, url, sub_dir):
    capture = Capture(sub_dir, self.conf)
    browser = Browser(url, sub_dir, self.conf)
    links = []
    try:
        capture.run()
        sleep(3)
        html = browser.get()
        links = self._get_links(html)
        sleep(30)
    except WebDriverException:
        self._create_exception_file(traceback.format_exc(), sub_dir)
    except KeyboardInterrupt:
        self._create_exception_file(traceback.format_exc(), sub_dir)
    finally:
        capture.kill()
        browser.close()
    return links

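# A minimal sketch (assumption, not the project's actual code) of the link-extraction
# helper _get_links referenced above; as a method it would also take self. It simply
# parses the captured HTML with BeautifulSoup and returns every anchor's href,
# leaving any filtering to the caller.
from bs4 import BeautifulSoup

def _get_links(html):
    soup = BeautifulSoup(html, "html.parser")
    return [a.get("href") for a in soup.find_all("a") if a.get("href")]
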
# Assumed fixture decorator: the body populates request.cls and yields the driver,
# which matches pytest's class-scoped fixture pattern.
@pytest.fixture(scope="class")
def setup(request, setUpClass):
    print("initiating chrome driver")
    driver = Browser().getbrowser("chrome")
    url = URL()
    driver.get(url.webViewerUAT)
    utility = Utility()
    # utility.createLogFolder()
    log = open(utility.logpath + "/WV-00.txt", "a+")
    driverUtility = DriverUtility(driver, log)
    loginPageObject = LoginPageObject(driverUtility, log)
    request.cls.driver = driver
    request.cls.url1 = url
    request.cls.utility = utility
    request.cls.driverUtility = driverUtility
    request.cls.loginPageObject = loginPageObject
    print("setup ended")
    yield driver
    driver.close()

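# Hedged usage sketch (not part of the original suite): a test class that consumes the
# fixture above through the attributes it attaches via request.cls. The class and test
# names are made up for illustration.
@pytest.mark.usefixtures("setup")
class TestLoginSmoke:
    def test_login_page_loads(self):
        # self.driver was attached to the class by the fixture.
        assert self.driver.current_url.startswith("http")
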
class WV00(unittest.TestCase):

    # @pytest.fixture(scope="module")
    def setUpClass(self):
        self.utility = Utility()
        # CHANGE THE LOG FILE NAME IN THE NEXT LINE
        self.log = open(self.utility.logpath + "/WV-00.txt", "a+")
        self.suite_start_time = time.time()
        self.log.write("Suite started at {}\n".format(
            str(time.ctime(int(self.suite_start_time)))))
        self.url = URL()
        self.loginPageStaticTexts = LoginPageStaticText()
        self.loginPageTestData = LoginPageTestData()
        self.configTestCase = configparser.RawConfigParser()
        # CHANGE THE CONFIG PROPERTY FILE NAME IN THE NEXT LINE
        self.configTestCase.read(
            os.path.dirname(os.getcwd()) + '/TestCases/WV_00_Config.properties')
        self.configECG = configparser.RawConfigParser()
        self.configECG.read(
            os.path.dirname(os.getcwd()) + '/Scripts/ECGRelatedData.properties')
        self.configDevice = configparser.RawConfigParser()
        self.configDevice.read(
            os.path.dirname(os.getcwd()) + '/Scripts/DeviceRelatedData.properties')
        self.sendECG = SendECG()
        yield
        self.suite_end_time = time.time()
        self.total_time_taken_suite = self.suite_end_time - self.suite_start_time
        self.log.write("Suite ended at {}\n".format(
            str(time.ctime(int(self.suite_end_time)))))
        self.log.write(
            "Total time taken by Test Suite to finish: {} seconds\n".format(
                self.total_time_taken_suite))
        self.log.close()

    @pytest.fixture()
    def setUp(self):
        self.driver = Browser().getbrowser("chrome")
        self.driver.get(self.url.webViewerUAT)
        self.driverUtility = DriverUtility(self.driver, self.log)
        self.loginPageObject = LoginPageObject(self.driverUtility, self.log)
        yield
        self.driver.close()

    # WV-00-000
    # Description :
    # Procedure/Protocol :
    #   - Generate a test ECG
    # Acceptance Criteria :
    def test_WV00(self, setUpClass, setUp):
        testCaseID = "WV-00-000"
        if self.configTestCase.get(testCaseID, 'Enabled') == "Yes":
            startTime = self.driverUtility.startTestCase(testCaseID)
            try:
                # code here for test cases without ECGs
                self.driverUtility.passedTestCase(testCaseID, startTime)
            except AssertionError as error:
                self.driverUtility.failedTestCase(testCaseID)
            except Exception as e:
                self.driverUtility.erroredTestCase(testCaseID)
        else:
            self.log.write("Test case {} not enabled\n".format(testCaseID))
        self.log.write(
            "********************************************************************************\n")

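# Hedged illustration (not from the original suite) of the RawConfigParser section the
# test above reads from WV_00_Config.properties; only the 'Enabled' key is confirmed by
# the code, so any other keys would be assumptions.
#
#   [WV-00-000]
#   Enabled = Yes
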
class NhaDat247(CrawlerObject):
    BASE_URL = "https://nhadat247.com.vn/"
    SAVE_CHECK_POINT = 5

    def __init__(self, date_from=None, date_to=None, post_type=None,
                 all_date: bool = False, resume=False, limit=-1):
        self.limit = int(limit)
        self.db_object = DBObject()
        the_status = "crawling"
        worker_info = self.db_object.query_wokers_info(Settings.worker_id)
        self.resume = resume
        if self.resume:
            try:
                info_ = worker_info
                status_ = info_["status"]
                task_id = info_["task_id"]
                info_str_ = info_["str_info"]
                if not ("(pause)" in status_ and "crawling" in status_):
                    print(">>", status_)
                    return
                info_dict_ = {
                    _i_.split(": ")[0]: _i_.split(": ")[1]
                    for _i_ in info_str_.lower().split(", ")
                }
                if info_dict_["site"] != "nhadat247.com.vn":
                    return
                date_from = info_dict_["date"].split("-")[0]
                date_to = info_dict_["date"].split("-")[1]
                try:
                    self.limit = int(info_dict_["limit"])
                except Exception:
                    self.limit = -1
                post_type = info_dict_["type"]
                the_status = status_.replace("(pause)", "")
                print("Internal loading data to resume")
            except Exception:
                traceback.print_exc()
                return
        self.__str_info = "Site: nhadat247.com.vn, Type: %s, Date: %s-%s, Limit: %s, " % (
            post_type, date_from, date_to,
            str(self.limit) if isinstance(self.limit, int) and self.limit > 0 else "No")
        self.__str_info += "Numpost: %d, Error: %d"
        self.post_type = post_type
        self.buffer = []
        self.seed_url = NhaDat247.get_seed_url(post_type)
        self.__current_url = ""
        self.__failed_urls = []
        self.__saved_post = []
        self.file_log_visited_url = "visited_post_log_nhadat247_%s.txt" % (self.post_type)
        self.file_log_new_url = "local_urls_log_nhadat247_%s.txt" % (self.post_type)
        self.regex_sub_url = re.compile(
            "([a-z][-a-z]*)?ban-[-a-z]+((.html)|(/[0-9]+))?")
        self.regex_post = re.compile(
            "([a-z][-a-z]*)?ban-[-a-z0-9]+/[-a-z0-9]+pr[0-9]+.html")
        self.key_type = NhaDat247.get_key_from_type(self.post_type)
        try:
            last_day_to = calendar.monthrange(int(date_to.split("/")[1]),
                                              int(date_to.split("/")[0]))[1]
            self.post_date_range = {
                "from": datetime.strptime("1/" + date_from, '%d/%m/%Y').date(),
                "to": datetime.strptime(str(last_day_to) + "/" + date_to, '%d/%m/%Y').date()
            }
            print("-" * 200, "\n", self.post_date_range)
        except Exception:
            traceback.print_exc()
            self.post_date_range = None
        self.browser = Browser(headless=False)
        if not self.resume:
            task_id = int(time.time())
        self.__crawling_info = {
            "task_id": task_id,
            "status": the_status,
            "str_info": ""
        }
        self.__crawling_log = {
            "worker_id": Settings.worker_id,
            "task_id": task_id,
            "task_info": self.__str_info % (0, 0),
            "saved_posts": [],
            "error_posts": []
        }
        if not self.resume:
            print("Create log")
            self.db_object.create_wokers_log(self.__crawling_log)
            self.update_crawling_status_info(0, 0)
        else:
            log = self.db_object.query_wokers_logs(Settings.worker_id, task_id)
            print("Get log: ", log if log else "null")
            if log is not None:
                self.__saved_post = log["saved_posts"]
                self.__failed_urls = log["error_posts"]
        print("Init crawler")

    def update_crawling_status_info(self, num_post, num_error):
        self.__crawling_info["str_info"] = self.__str_info % (num_post, num_error)
        self.db_object.update_wokers_info(Settings.worker_id, self.__crawling_info)

    def update_crawling_log(self):
        self.db_object.update_wokers_log(Settings.worker_id,
                                         self.__crawling_log["task_id"],
                                         self.__saved_post, self.__failed_urls)

    def get_html_and_soup_from_url(self, url):
        """Return the page HTML and its BeautifulSoup object, or (None, None) on failure."""
        _soup = None
        _html = None
        for i in range(5):
            try:
                element_present = EC.presence_of_element_located(
                    (By.CSS_SELECTOR, "body > div.footer"))
                _html = self.browser.get_html(url, until_ec=element_present)
                _soup = BeautifulSoup(_html, 'html.parser')
                if _soup is not None:
                    return _html, _soup
            except Exception:
                traceback.print_exc()
                continue
        self.__failed_urls.append(self.__current_url)
        return None, None

    @staticmethod
    def get_key_from_type(key) -> list:
        if key == "land":
            return ["ban-dat"]
        elif key == "apartment":
            return ["ban-can-ho-chung-cu"]
        elif key == "house":
            return ["ban-nha-mat-pho", "ban-nha-biet-thu", "ban-nha-rieng"]
        return [
            "ban-dat", "ban-can-ho-chung-cu", "ban-nha-rieng",
            "ban-nha-mat-pho", "ban-nha-biet-thu"
        ]

    def check_type(self, url) -> bool:
        for key in self.key_type:
            if key in url:
                return True
        return False

    def append_data(self, _url, _type, _status, _crawl_date, _post_date, _html):
        post = {}
        url_hash = hashlib.md5(_url.encode()).hexdigest()
        post["url_hash"] = url_hash
        post["url"] = _url
        post["type"] = _type
        post["status"] = _status
        post["html"] = _html
        post["date"] = _crawl_date
        post["post_date"] = _post_date
        self.__saved_post.append(url_hash)
        self.buffer.append(post)

    def load_init_url(self) -> tuple:
        local_urls = self.seed_url
        visited_post = []
        if self.resume:
            try:
                local_urls = list(open(self.file_log_new_url, "r").readlines())
            except Exception:
                pass
            try:
                visited_post = list(open(self.file_log_visited_url, "r").readlines())
            except Exception:
                pass
        return local_urls, visited_post

    def get_date(self, page_soup: BeautifulSoup) -> date:
        post_date = None
        try:
            str_date = page_soup.select_one(
                "#ContentPlaceHolder1_ProductDetail1_divprice > div").get_text().split("|")[1]
            str_date = slugify(str_date.strip().lower())
            if "hom-kia" in str_date:
                post_date = date.today() - timedelta(days=2)
            elif "hom-qua" in str_date:
                post_date = date.today() - timedelta(days=1)
            elif "hom-nay" in str_date:
                post_date = date.today()
            else:
                post_date = datetime.strptime(str_date, '%d-%m-%Y').date()
        except Exception:
            self.__failed_urls.append(self.__current_url)
            traceback.print_exc()
        return post_date

    def visit(self, current_url) -> tuple:
        local_urls = []
        post_date = None
        page_source, page_soup = self.get_html_and_soup_from_url(current_url)
        if page_soup:
            is_post = re.search(self.regex_post, current_url)
            if is_post:
                print("Is a post")
                post_date = self.get_date(page_soup)
                if not self.post_date_range or \
                        (isinstance(post_date, date) and
                         (self.post_date_range["from"] <= post_date <= self.post_date_range["to"])):
                    post_date = post_date.strftime('%d/%m/%Y')
                else:
                    page_source = None
            else:
                page_source = None
            list_href = page_soup.find_all('a')
            for link in list_href:
                anchor = str(link.get('href'))
                if not bool(urlparse(anchor).netloc):
                    anchor = urljoin(self.BASE_URL, anchor)
                if validators.url(anchor) and self.check_type(anchor) and (
                        self.regex_post.search(anchor) or self.regex_sub_url.search(anchor)):
                    local_urls.append(anchor)
        print("<html>" if page_source else "None")
        return page_source, post_date, local_urls

    def obtain_data(self):
        print("START...")
        num_visited = 0
        local_urls, visited_post = self.load_init_url()
        post_count = len(self.__saved_post)
        while local_urls:
            self.__current_url = local_urls.pop(0)
            # Skip URLs that are too short, already visited, or of the wrong type.
            if (len(self.__current_url) < 10
                    or self.__current_url in visited_post
                    or not self.check_type(self.__current_url)):
                continue
            print(" > ", self.__current_url)
            page_source, post_date, new_urls_to_visit = self.visit(self.__current_url)
            visited_post.append(self.__current_url)
            local_urls += new_urls_to_visit
            if page_source:
                post_count += 1
                self.append_data(_url=self.__current_url,
                                 _type="post",
                                 _status="0",
                                 _html=page_source,
                                 _crawl_date=str(date.today().strftime("%d/%m/%Y")),
                                 _post_date=post_date)
            # check-point to save buffer data
            if num_visited % self.SAVE_CHECK_POINT == 0:
                self.save_data()
                self.update_crawling_status_info(post_count, len(self.__failed_urls))
                self.update_crawling_log()
                NhaDat247.save_list(local_urls, self.file_log_new_url)
                NhaDat247.save_list(visited_post, self.file_log_visited_url)
            num_visited += 1
            print(" >> num: ", post_count)
            if self.limit > 0 and post_count >= self.limit:
                break
        # finishing
        self.save_data()
        self.update_crawling_status_info(post_count, len(self.__failed_urls))
        self.update_crawling_log()
        self.browser.close()
        print('CRAWLING DONE')

    def rotate_ip(self, enable=False):
        self.browser.set_rotate_ip(enable)

    def save_data(self):
        self.db_object.insert_html_data(self.buffer, many=True)
        # clear buffer
        self.buffer = []

    @staticmethod
    def get_seed_url(post_type):
        data = {
            "apartment": ["https://nhadat247.com.vn/ban-can-ho-chung-cu.html"],
            "house": [
                "https://nhadat247.com.vn/ban-nha-rieng.html",
                "https://nhadat247.com.vn/ban-nha-biet-thu-lien-ke.html",
                "https://nhadat247.com.vn/ban-nha-mat-pho.html"
            ],
            "land": [
                "https://nhadat247.com.vn/ban-dat-nen-du-an.html",
                "https://nhadat247.com.vn/ban-dat.html"
            ]
        }
        return data[post_type] if post_type in data else [
            url for e in data for url in data[e]
        ]

    @staticmethod
    def save_list(data: list, file_name):
        print("Checkpoint: ", file_name)
        with open(file_name, 'w') as file:
            file.write("\n".join(set(data)))

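# Hedged usage sketch (not part of the original module): a one-off crawl of apartment
# listings posted between March and April 2021, capped at 100 posts. The date strings
# follow the '%m/%Y' format parsed in __init__; running it requires the project's
# DBObject, Settings and Browser environment.
if __name__ == "__main__":
    crawler = NhaDat247(date_from="03/2021", date_to="04/2021",
                        post_type="apartment", limit=100)
    crawler.obtain_data()
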
class ChoTotCrawler(CrawlerObject):
    BASE_URL = "https://nha.chotot.com/"
    SAVE_CHECK_POINT = 5

    def __init__(self, date_from=None, date_to=None, post_type=None,
                 all_date: bool = False, resume=False, limit=-1):
        self.limit = int(limit)
        self.db_object = DBObject()
        the_status = "crawling"
        worker_info = self.db_object.query_wokers_info(Settings.worker_id)
        self.resume = resume
        if self.resume:
            try:
                info_ = worker_info
                status_ = info_["status"]
                task_id = info_["task_id"]
                info_str_ = info_["str_info"]
                if not ("(pause)" in status_ and "crawling" in status_):
                    print(">>", status_)
                    return
                info_dict_ = {
                    _i_.split(": ")[0]: _i_.split(": ")[1]
                    for _i_ in info_str_.lower().split(", ")
                }
                if info_dict_["site"] != "nha.chotot.com":
                    return
                date_from = info_dict_["date"].split("-")[0]
                date_to = info_dict_["date"].split("-")[1]
                try:
                    self.limit = int(info_dict_["limit"])
                except Exception:
                    self.limit = -1
                post_type = info_dict_["type"]
                the_status = status_.replace("(pause)", "")
                print("Internal loading data to resume")
            except Exception:
                traceback.print_exc()
                return
        self.__str_info = "Site: nha.chotot.com, Type: %s, Date: %s-%s, Limit: %s, " % (
            post_type, date_from, date_to,
            str(self.limit) if isinstance(self.limit, int) and self.limit > 0 else "No")
        self.__str_info += "Numpost: %d, Error: %d"
        self.post_type = post_type
        self.buffer = []
        self.seed_url = ChoTotCrawler.get_seed_url(post_type)
        self.__current_url = ""
        self.__failed_urls = []
        self.__saved_post = []
        self.file_log_visited_url = "visited_post_log_chotot_%s.txt" % (self.post_type)
        self.file_log_new_url = "local_urls_log_chotot_%s.txt" % (self.post_type)
        self.regex_sub_url = re.compile(
            "([a-z][-a-z]*)?ban-[-a-z]+((.htm)|(/[0-9]+))?")
        self.regex_post = re.compile(
            "([a-z][-a-z]+)?[/][a-z][-a-z0-9]+/[-a-z0-9]+.htm")
        self.key_type = ChoTotCrawler.get_key_from_type(self.post_type)
        try:
            last_day_to = calendar.monthrange(int(date_to.split("/")[1]),
                                              int(date_to.split("/")[0]))[1]
            self.post_date_range = {
                "from": datetime.strptime("1/" + date_from, '%d/%m/%Y').date(),
                "to": datetime.strptime(str(last_day_to) + "/" + date_to, '%d/%m/%Y').date()
            }
            print("-" * 200, "\n", self.post_date_range)
        except Exception:
            traceback.print_exc()
            self.post_date_range = None
        self.browser = Browser(headless=False)
        if not self.resume:
            task_id = int(time.time())
        self.__crawling_info = {
            "task_id": task_id,
            "status": the_status,
            "str_info": ""
        }
        self.__crawling_log = {
            "worker_id": Settings.worker_id,
            "task_id": task_id,
            "task_info": self.__str_info % (0, 0),
            "saved_posts": [],
            "error_posts": []
        }
        if not self.resume:
            print("Create log")
            self.db_object.create_wokers_log(self.__crawling_log)
            self.update_crawling_status_info(0, 0)
        else:
            log = self.db_object.query_wokers_logs(Settings.worker_id, task_id)
            print("Get log: ", log if log else "null")
            if log is not None:
                self.__saved_post = log["saved_posts"]
                self.__failed_urls = log["error_posts"]
        print("Init crawler")

    def update_crawling_status_info(self, num_post, num_error):
        self.__crawling_info["str_info"] = self.__str_info % (num_post, num_error)
        self.db_object.update_wokers_info(Settings.worker_id, self.__crawling_info)

    def update_crawling_log(self):
        self.db_object.update_wokers_log(Settings.worker_id,
                                         self.__crawling_log["task_id"],
                                         self.__saved_post, self.__failed_urls)

    def get_html_and_soup_from_url(self, url):
        """Return the page HTML and its BeautifulSoup object, or (None, None) on failure."""
        _soup = None
        _html = None
        # Script injected on post pages to reveal the contact phone number.
        click_phone_script = """
        function getElementByXpath(path) {
            return document.evaluate(path, document, null,
                XPathResult.FIRST_ORDERED_NODE_TYPE, null).singleNodeValue;
        }
        var phone = getElementByXpath("//*[@id='__next']/div[3]/div[1]/div/div[4]/div[3]/div/linkcontact");
        if (phone != null) {
            phone.click();
        }
        """
        for i in range(5):
            try:
                is_post = re.search(self.regex_post, url)
                element_present = EC.presence_of_element_located(
                    (By.XPATH, "//html/body/div[1]/footer"))
                _html = self.browser.get_html(
                    url=url,
                    until_ec=element_present,
                    run_script=click_phone_script if is_post else None)
                _soup = BeautifulSoup(_html, 'html.parser')
                if _soup is not None:
                    return _html, _soup
            except Exception:
                traceback.print_exc()
                continue
        self.__failed_urls.append(self.__current_url)
        return None, None

    @staticmethod
    def get_key_from_type(key) -> list:
        if key == "land":
            return ["mua-ban-dat"]
        elif key == "apartment":
            return ["mua-ban-can-ho-chung-cu"]
        elif key == "house":
            return ["mua-ban-nha-dat"]
        return ["mua-ban-dat", "mua-ban-nha-dat", "mua-ban-can-ho-chung-cu"]

    def check_type(self, url) -> bool:
        for key in self.key_type:
            if key in url:
                return True
        return False

    def append_data(self, _url, _type, _status, _crawl_date, _post_date, _html):
        post = {}
        url_hash = hashlib.md5(_url.encode()).hexdigest()
        post["url_hash"] = url_hash
        post["url"] = _url
        post["type"] = _type
        post["status"] = _status
        post["html"] = _html
        post["date"] = _crawl_date
        post["post_date"] = _post_date
        self.__saved_post.append(url_hash)
        self.buffer.append(post)

    def load_init_url(self) -> tuple:
        local_urls = self.seed_url
        visited_post = []
        if self.resume:
            try:
                local_urls = list(open(self.file_log_new_url, "r").readlines())
            except Exception:
                pass
            try:
                visited_post = list(open(self.file_log_visited_url, "r").readlines())
            except Exception:
                pass
        return local_urls, visited_post

    @staticmethod
    def convert_str2date(date_str):
        _date = None
        date_str = slugify(date_str.lower())
        _l = date_str.split("-")
        if "hom-qua" in date_str:
            _date = date.today() - timedelta(days=1)
        elif "thang" in _l:
            _n = int(_l[_l.index("thang") - 1][0])
            _date = date.today() - timedelta(days=30 * _n)
        elif "tuan" in _l:
            _n = int(_l[_l.index("tuan") - 1][0])
            _date = date.today() - timedelta(days=7 * _n)
        elif "ngay" in _l:
            # "n days ago": subtract the parsed number of days.
            _n = int(_l[_l.index("ngay") - 1][0])
            _date = date.today() - timedelta(days=_n)
        elif "hom-nay" in date_str or "gio" in _l or "phut" in _l:
            _date = date.today()
        else:
            _date = datetime.strptime(date_str, '%d/%m/%Y').date()
        return _date

    def get_date(self, page_soup: BeautifulSoup) -> date:
        post_date = None
        try:
            str_date = page_soup.select_one(
                "#__next > div > div.ct-detail.adview > div > div.col-md-8 > "
                "div.adImageWrapper___KTd-h > div.imageCaption___cMU2J > span").get_text()
            str_date = str_date.strip()
            post_date = ChoTotCrawler.convert_str2date(str_date)
        except Exception:
            self.__failed_urls.append(self.__current_url)
            traceback.print_exc()
        return post_date

    def visit(self, current_url) -> tuple:
        local_urls = []
        post_date = None
        page_source, page_soup = self.get_html_and_soup_from_url(current_url)
        if page_soup:
            is_post = re.search(self.regex_post, current_url)
            if is_post:
                print("Is a post")
                post_date = self.get_date(page_soup)
                if not self.post_date_range or \
                        (isinstance(post_date, date) and
                         (self.post_date_range["from"] <= post_date <= self.post_date_range["to"])):
                    post_date = post_date.strftime('%d/%m/%Y')
                else:
                    page_source = None
            else:
                page_source = None
            list_href = page_soup.find_all('a')
            for link in list_href:
                anchor = str(link.get('href'))
                if not bool(urlparse(anchor).netloc):
                    anchor = urljoin(self.BASE_URL, anchor)
                if validators.url(anchor) and self.check_type(anchor) and (
                        self.regex_post.search(anchor) or self.regex_sub_url.search(anchor)):
                    local_urls.append(anchor)
        print("<html>" if page_source else "None")
        return page_source, post_date, local_urls

    def obtain_data(self):
        print("START...")
        num_visited = 0
        local_urls, visited_post = self.load_init_url()
        post_count = len(self.__saved_post)
        while local_urls:
            self.__current_url = local_urls.pop(0)
            # Skip URLs that are too short, already visited, or of the wrong type.
            if (len(self.__current_url) < 10
                    or self.__current_url in visited_post
                    or not self.check_type(self.__current_url)):
                continue
            print(" > ", self.__current_url)
            page_source, post_date, new_urls_to_visit = self.visit(self.__current_url)
            visited_post.append(self.__current_url)
            local_urls += new_urls_to_visit
            if page_source:
                post_count += 1
                self.append_data(_url=self.__current_url,
                                 _type="post",
                                 _status="0",
                                 _html=page_source,
                                 _crawl_date=str(date.today().strftime("%d/%m/%Y")),
                                 _post_date=post_date)
            # check-point to save buffer data
            if num_visited % self.SAVE_CHECK_POINT == 0:
                self.save_data()
                self.update_crawling_status_info(post_count, len(self.__failed_urls))
                self.update_crawling_log()
                ChoTotCrawler.save_list(local_urls, self.file_log_new_url)
                ChoTotCrawler.save_list(visited_post, self.file_log_visited_url)
            num_visited += 1
            print(" >> num: ", post_count)
            if self.limit > 0 and post_count >= self.limit:
                break
        # finishing
        self.save_data()
        self.update_crawling_status_info(post_count, len(self.__failed_urls))
        self.update_crawling_log()
        self.browser.close()
        print('CRAWLING DONE')

    def rotate_ip(self, enable=False):
        self.browser.set_rotate_ip(enable)

    def save_data(self):
        self.db_object.insert_html_data(self.buffer, many=True)
        # clear buffer
        self.buffer = []

    @staticmethod
    def get_seed_url(post_type):
        data = {
            "apartment": ["https://nha.chotot.com/toan-quoc/mua-ban-can-ho-chung-cu"],
            "house": ["https://nha.chotot.com/toan-quoc/mua-ban-nha-dat"],
            "land": ["https://nha.chotot.com/toan-quoc/mua-ban-dat"]
        }
        return data[post_type] if post_type in data else [
            url for e in data for url in data[e]
        ]

    @staticmethod
    def save_list(data: list, file_name):
        print("Checkpoint: ", file_name)
        with open(file_name, 'w') as file:
            file.write("\n".join(set(data)))

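# Hedged illustration (not in the original module) of how convert_str2date maps the
# site's Vietnamese relative-date strings onto dates; the exact outputs depend on
# date.today() at run time.
if __name__ == "__main__":
    for s in ["Hôm nay", "Hôm qua", "2 tuần trước", "3 tháng trước"]:
        print(s, "->", ChoTotCrawler.convert_str2date(s))
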
if center:
    if len(center) == 1:
        s = 0
        c = 0
        seq.append(center[0])
        if len(seq) == 5:
            movimento = open_cv_wrapper.check_movement(seq, frame)
            print(movimento)
            browser.movement_detection(movimento)
            seq.pop(0)
else:
    s += 1
    if s >= 5:
        is_browser_active = browser.is_active()
        if is_browser_active is not None and is_browser_active:
            browser.close()
        s = 0
        continue
    else:
        c += 1
        if c == 3:
            seq = []
            s = 0

t2 = time.time()
if t2 - t1 >= 1:
    fps = f
    f = 0
    t1 = time.time()
else:
    f += 1

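# Hedged sketch of the loop the fragment above presumably runs inside: a cv2 capture
# loop that produces `frame` and a list of detected `center` points each iteration.
# detect_centers is a hypothetical placeholder, not the project's actual detector.
import time
import cv2

def detect_centers(frame):
    # hypothetical placeholder detector returning a list of (x, y) points
    return []

cap = cv2.VideoCapture(0)
seq, s, c, f, fps = [], 0, 0, 0, 0
t1 = time.time()
while cap.isOpened():
    ok, frame = cap.read()
    if not ok:
        break
    center = detect_centers(frame)
    # ... the movement-detection / FPS fragment above goes here ...
cap.release()
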
class Upload:
    def __init__(self, user):
        self.bot = None
        self.lang = "en"
        self.url = f"https://www.tiktok.com/upload?lang={self.lang}"
        self.cookies = None
        self.userRequest = {"dir": "", "cap": "", "vidTxt": ""}
        self.video = None
        self.IO = IO("hashtags.txt", "schedule.csv")
        self.videoFormats = ["mov", "flv", "avi"]
        self.userPreference = user

    # Method used to upload a video.
    def uploadVideo(self, video_dir, videoText, startTime=0, endTime=0,
                    private=True, test=False, scheduled=False, schdate="", schtime=""):
        video_dir = self.downloadIfYoutubeURL(video_dir)
        if not video_dir:
            return
        # Initiate the bot if it isn't already.
        if self.bot is None:
            self.bot = Browser().getBot()
            self.webbot = Bot(self.bot)
        self.userRequest["dir"] = video_dir
        self.checkFileExtensionValid()
        self.userRequest["cap"] = self.IO.getHashTagsFromFile()
        self.bot.get(self.url)
        self.userRequest["vidTxt"] = videoText
        # Cookies loaded here; the user is now logged in and can upload videos.
        self.cookies = Cookies(self.bot)
        self.bot.refresh()
        time.sleep(3)
        self.inputVideo(startTime, endTime)
        self.addCaptions()
        utils.randomTimeQuery()
        if private:
            self.webbot.selectPrivateRadio()  # private video selection
        else:
            self.webbot.selectPublicRadio()  # public video selection
        utils.randomTimeQuery()
        if not test:
            self.webbot.uploadButtonClick()  # upload button
        input("Press any button to exit")

    def createVideo(self, video_dir, videoText, startTime=0, endTime=0):
        video_dir = self.downloadIfYoutubeURL(video_dir)
        if not video_dir:
            return
        self.inputVideo(startTime, endTime)
        self.addCaptions()
        print(f"Video has been created: {self.video.dir}")

    # Method to check the file extension is valid.
    def checkFileExtensionValid(self):
        if not self.userRequest["dir"].endswith('.mp4'):
            self.bot.close()
            exit(f"File: {self.userRequest['dir']} has wrong file extension.")

    # Gets the hashtags from file and adds them to the website caption input.
    def addCaptions(self, hashtag_file=None):
        if not hashtag_file:
            caption_elem = self.webbot.getCaptionElem()
            for hashtag in self.IO.getHashTagsFromFile():
                caption_elem.send_keys(hashtag)

    def clearCaptions(self):
        caption_elem = self.webbot.getCaptionElem()
        caption_elem.send_keys("")

    def inputScheduler(self, schdate, schtime):
        # In charge of selecting the scheduler in the input.
        utils.randomTimeQuery()
        self.webbot.selectScheduleToggle()

    # Adds the video into the TikTok file input element.
    def inputVideo(self, startTime=0, endTime=0):
        try:
            file_input_element = self.webbot.getVideoUploadInput()
        except Exception as e:
            print("Major error, cannot find the upload button, "
                  "please update getVideoUploadInput() in Bot.py")
            print(f"Actual Error: {e}")
            file_input_element = ""
            exit()
        # Check if file has correct .mp4 extension, else throw error.
        self.video = Video(self.userRequest["dir"], self.userRequest["vidTxt"],
                           self.userPreference)
        print(f"startTime: {startTime}, endTime: {endTime}")
        if startTime != 0 and endTime != 0 or endTime != 0:
            print(f"Cropping Video timestamps: {startTime}, {endTime}")
            self.video.customCrop(startTime, endTime)
        # Crop first and then make the video.
        self.video.createVideo()  # Link to video class method
        while not os.path.exists(self.video.dir):  # Wait for path to exist
            time.sleep(1)
        abs_path = os.path.join(os.getcwd(), self.video.dir)
        file_input_element.send_keys(abs_path)

    def downloadIfYoutubeURL(self, video_dir) -> str:
        """
        Determine whether the given video directory is a YouTube link; if so, return
        the downloaded video path, otherwise return the current path unchanged.
        """
        url_variants = [
            "http://youtu.be/", "https://youtu.be/", "http://youtube.com/",
            "https://youtube.com/", "https://m.youtube.com/",
            "http://www.youtube.com/", "https://www.youtube.com/"
        ]
        if any(ext in video_dir for ext in url_variants):
            print("Detected Youtube Video...")
            video_dir = Video.get_youtube_video(self.userPreference, video_dir)
        return video_dir

    def directUpload(self, filename, private=False, test=False):
        if self.bot is None:
            self.bot = Browser().getBot()
            self.webbot = Bot(self.bot)
        self.bot.get(self.url)
        utils.randomTimeQuery()
        self.cookies = Cookies(self.bot)
        self.bot.refresh()
        try:
            file_input_element = self.webbot.getVideoUploadInput()
        except Exception as e:
            print(f"Error: {e}")
            print("Major error, cannot find the file upload button, "
                  "please update getVideoUploadInput() in Bot.py")
            file_input_element = None
            exit()
        abs_path = os.path.join(os.getcwd(), filename)
        try:
            file_input_element.send_keys(abs_path)
        except StaleElementReferenceException:
            try:
                self.bot.implicitly_wait(5)
                file_input_element = self.webbot.getVideoUploadInput()
                file_input_element.send_keys(abs_path)
            except Exception:
                print("Major error, cannot find the file upload button, "
                      "please update getVideoUploadInput() in Bot.py")
                exit()
        # We need to wait until it is uploaded and then clear the input.
        self.addCaptions()
        utils.randomTimeQuery()
        if private:
            self.webbot.selectPrivateRadio()  # private video selection
            utils.randomTimeQuery()
        else:
            # self.webbot.selectPublicRadio()  # public video selection
            # utils.randomTimeQuery()
            pass
        if not test:
            self.webbot.uploadButtonClick()  # upload button
        input("Press any button to exit")

class Reviews:
    def __init__(self, path=None, lang="ar", edition_reviews=False):
        # Language of reviews to be scraped
        self._lang = lang
        # Instantiate browsing and writing managers
        self.wr = Writer(path) if path else Writer()
        self.br = Browser(edition_reviews)
        # Initialize an empty threads list
        self._threads = []
        # Counter for reviews from different languages
        self._invalid = None

    def start(self):
        self.br.start()

    # Scrape and write books' reviews to separate files
    def output_books_reviews(self, books_ids, consider_previous=True):
        if consider_previous:
            # Don't loop through already scraped books
            self.wr.consider_written_files(books_ids)
        # Show how many books are going to be scraped
        print(f"Scraping {len(books_ids)} Books")
        # Loop through book ids in array and scrape books
        for book_id in books_ids:
            self.output_book_reviews(book_id)

    # Scrape and write one book's reviews to a file
    def output_book_reviews(self, book_id):
        self._threads.clear()
        # Open book file and page by its Id
        self.br.open_book_page(book_id)
        self.wr.open_book_file(book_id)
        # Reset invalid reviews counter and page counter
        self._invalid = 0
        # Scrape book meta data in first line
        self.run(self._scrape_book_meta, [book_id])
        # Scrape first page of the book anyway
        self.run(self._scrape_book_reviews)
        no_next_page = False
        try:
            # Scrape the remaining pages
            while self._invalid < 60:
                # Go to next page if there's one
                in_next_page = self.br.goto_next_page()
                if no_next_page or not in_next_page:
                    no_next_page = False
                    # Switch to a different reviews mode
                    if not self.br.switch_reviews_mode(book_id, in_next_page is None):
                        # Break after switching to all modes
                        break
                # Wait until requested book reviews are loaded
                if self.br.are_reviews_loaded():
                    # Scrape loaded book reviews
                    self.run(self._scrape_book_reviews)
                else:
                    no_next_page = True
        finally:
            # Wait until all threads are done
            [thread.join() for thread in self._threads]
            # Finalize file name and close it
            self.wr.close_book_file()

    # Scrape and write book and author data
    def _scrape_book_meta(self, html, book_id):
        # Create soup object and store book meta section of the page in soup
        soup = BeautifulSoup(html, "lxml").find(id="metacol")
        # If book is not found
        if not soup:
            print(f"*Book ID:\t{book_id:<15}Not Found!")
            # Close file and raise an error
            self.wr.close_book_file()
            raise FileNotFoundError
        # Get book title and remove spaces from it
        title = soup.find(id="bookTitle").get_text(". ", strip=True)
        # Get average rating of the book out of five
        rating = soup.find(class_="average").get_text()
        # Store author data section
        author = soup.find(class_="authorName")
        # Get author id from url
        id_ = author.get("href")[38:].split(".")[0]
        # Get author name
        name = author.find().get_text()
        # Write scraped meta data to file's first line
        self.wr.write_book_meta(book_id, title, rating, id_, name)
        # Display book id and title
        print(f"*Book ID:\t{book_id:<15}Title:\t{title}")

    # Scrape a single page's reviews
    def _scrape_book_reviews(self, html):
        # Store reviews section of the page in soup
        soup = BeautifulSoup(html, "lxml").find(id="bookReviews")
        # Loop through reviews individually
        for review in soup.find_all(class_="review"):
            try:
                # Get user / reviewer id
                user_id = review.find(class_="user").get("href")[11:].split("-")[0]
                # Get rating out of five stars
                stars = len(review.find(class_="staticStars").find_all(class_="p10"))
                # Get full review text even the hidden parts, and remove spaces and newlines
                comment = review.find(class_="readable").find_all("span")[-1].get_text(". ", strip=True)
                # Detect which language the review is in
                if detect(comment) != self._lang:
                    # Count it as a different language review
                    self._invalid += 1
                    continue
                # Get review date
                date = review.find(class_="reviewDate").get_text()
            # Skip the rest if one of the above is missing
            except Exception:
                # Count it as an invalid review
                self._invalid += 2
                continue
            # If it's not a strike, reset the counter
            self._invalid = 0
            # Get review ID
            review_id = review.get("id")[7:]
            # Write the scraped review to the file
            self.wr.write_review(review_id, user_id, date, stars, comment)
            # Add review id to ids
            print(f"Added ID:\t{review_id}")
        return True

    # Starts a scraping process on a new thread
    def run(self, method, args=[]):
        # Create a thread and add it to threads list then start it
        self._threads.append(SafeThread(target=method, args=[self.br.page_source] + args))
        self._threads[-1].start()

    def reset(self):
        self.stop()
        self.start()
        print("Restarted Reviews")

    def stop(self):
        self.br.close()
        self.wr.delete_file()

    def close(self):
        self.br.quit()
        self.wr.close()
        self._threads.clear()
        print("Closed Reviews")
