def __init__(self, debug=False):
    """Create a Bitkub API client.

    Key and secret are read from the BITKUB_API_KEY / BITKUB_API_SECRET
    environment variables (empty strings when unset).

    :param debug: also passed through to the underlying Browser helper
    """
    self._debug = debug
    self._api = 'https://api.bitkub.com/api'
    self._key = os.getenv('BITKUB_API_KEY', '')
    self._secret = os.getenv('BITKUB_API_SECRET', '').encode()
    self._browser = Browser(debug=self._debug)
def __init__(self, debug=False):
    """Create a Satang Pro API client.

    User id, key and secret are read from SATANG_USER_ID / SATANG_API_KEY /
    SATANG_API_SECRET environment variables (empty strings when unset).

    :param debug: also passed through to the underlying Browser helper
    """
    self._debug = debug
    self._api = 'https://api.satang.pro/api'
    self._uid = os.getenv('SATANG_USER_ID', '')
    self._key = os.getenv('SATANG_API_KEY', '')
    self._secret = os.getenv('SATANG_API_SECRET', '').encode('utf-8')
    self._browser = Browser(debug=self._debug)
def __init__(self, host):
    """Init object with forum url (host) and Browser object.

    Exits the process when the Browser cannot be created (HTTPError).
    """
    self.host = host
    try:
        self.browser = Browser()
    except HTTPError as err:
        print(err)
        sys.exit(1)
def setUp(self) -> None:
    """Launch Chrome on the Baidu homepage and build the page objects."""
    driver = Browser("chrome", r".\tools\chromedriver.exe")
    driver.open_browser("http://www.baidu.com")
    logging.info("打开浏览器")
    logging.info(
        f"浏览器名称:{driver.browser_name},浏览器版本:{driver.browser_version}"
    )
    self.driver = driver
    self.homepage = HomePage(driver)
    self.newspage = NewsPage(driver)
class Platform(abc.ABC):
    """Abstract base for a scraped investment platform.

    Concrete subclasses implement login / account-value / available-funds
    retrieval; this base wires up the shared browser, per-platform config
    and e-mail alerting.
    """

    def __init__(self, platform):
        self.browser = Browser()
        self.config = Config(platform)
        self.email_alert = EmailAlert()
        # Expose the locator helper so subclasses can write self.By.XPATH etc.
        # (presumably selenium's By — TODO confirm against the import site)
        self.By = By

    # --- interface every concrete platform must provide ---

    @abc.abstractmethod
    def login(self):
        pass

    @abc.abstractmethod
    def get_account_value(self):
        pass

    @abc.abstractmethod
    def get_available_funds(self):
        pass

    # --- defaults and shared helpers ---

    def get_project_value(self, project_id):
        # Platforms without per-project values report zero.
        return 0

    def get_name(self):
        return self.__class__.__name__

    def get_username(self):
        return self.config.username

    def get_password(self):
        return self.config.password

    def get_account(self):
        return self.config.account

    def get_currency(self):
        return self.config.currency

    def has_projects(self):
        return self.config.has_projects()

    def get_projects(self):
        return self.config.projects

    def send_alert_email(self, platform_name, message):
        self.email_alert.send_email_alert(platform_name, message)

    def quit(self):
        self.browser.quit()
def setUp(self):
    """Open the browser, load the ZPC login page and sign in."""
    self.driver = Browser().get_browserdriver()
    self.login_page = LoginPage(self.driver)
    self.config = Config().get('ZPC')
    page = self.login_page
    cfg = self.config
    page.url = cfg.get('url')
    page.visit()
    page.wait(5)
    page.set_value(element=page.rec_user_input(), text=cfg.get('user'))
    page.set_value(element=page.rec_passwd_input(), text=cfg.get('pwd'))
    self.main_page = page.click_login_btn()
class Baidu(unittest.TestCase):
    """UI tests for the Baidu homepage: search and news navigation."""

    def setUp(self) -> None:
        """Launch Chrome on the Baidu homepage and build the page objects."""
        driver = Browser("chrome", r".\tools\chromedriver.exe")
        driver.open_browser("http://www.baidu.com")
        logging.info("打开浏览器")
        logging.info(
            f"浏览器名称:{driver.browser_name},浏览器版本:{driver.browser_version}"
        )
        self.driver = driver
        self.homepage = HomePage(driver)
        self.newspage = NewsPage(driver)

    def tearDown(self) -> None:
        """Close the browser after each test."""
        self.driver.quit()
        logging.info("关闭浏览器")

    def test_search(self):
        """Searching for "selenium" returns selenium-related results."""
        logging.info("用例1:测试百度搜索框输入selenium能搜索出包含selenium相关的信息")
        # Type the query into the search box.
        self.homepage.input_box.send_keys("selenium")
        logging.info("输入搜索信息")
        # Submit the search.
        self.homepage.search_button.click()
        logging.info("点击搜索按钮")
        time.sleep(2)
        # At least one result link should mention "selenium".
        match = self.driver.find_element_by_partial_link_text("selenium")
        self.assertIsNotNone(match)

    def test_access_game_news(self):
        """The game section of Baidu News is reachable from the homepage."""
        logging.info("用例2:测试通过百度首页能进入新闻界面的游戏专题")
        # Open the news site (opens a new window).
        self.homepage.news_link.click()
        logging.info("点击新闻链接")
        self.driver.switch_to_new_page()
        logging.info("切换窗口")
        # Navigate to the game topic.
        self.newspage.game_link.click()
        logging.info("点击游戏链接")
        # Verify we landed on the game section.
        self.assertEqual(self.driver.current_url, "http://news.baidu.com/game")
def runCrawl(limitNum=0, queryList=None, is_all_comments=False):
    """Crawl Instagram posts for each query and save data under data/<query>/.

    For every query (a "#hashtag" or a username) this collects post URLs by
    scrolling, then saves raw HTML, extracted info and the post image.

    :param limitNum: scroll/collection limit passed to the browser helper
    :param queryList: hashtags ("#tag") or usernames to crawl
    :param is_all_comments: expand all comments before scraping each post
    """
    # Fix: the default was a shared mutable list ([]); use a None sentinel.
    if queryList is None:
        queryList = []
    browser = Browser("driver/chromedriver")
    for query in queryList:
        browser.clearLink()
        makeDir("data")
        makeDir("data/" + query)
        # Hashtags use the explore/tags URL, usernames the profile URL.
        if query[0] == "#":
            mUrl = "https://www.instagram.com/explore/tags/" + query[1:] + "/?hl=en"
        else:
            mUrl = "https://www.instagram.com/" + query + "/?hl=en"
        browser.goToPage(mUrl)
        print("collecting url of " + query + "...")
        browser.scrollPageToBottomUntilEnd(browser.collectDpageUrl, limitNum)
        print("finish scoll collecting!")
        print("collecting data...")
        # Deduplicate the collected post URLs.
        slist = list(set(browser.urlList))
        for url in tqdm(slist):
            dirName = url.split("/")[4]
            # skip if already crawled (makeDir returns falsy for existing dirs)
            if not makeDir("data/" + query + "/" + dirName):
                continue
            browser.goToPage(url)
            if is_all_comments:
                browser.expandComments()
            cur = browser.getPageSource()
            writeToFile("data/" + query + "/" + dirName + "/raw.html", [cur])
            infoData = cur.split("<meta content=")[1].split(" ")
            # extract data
            lang = extractLang(cur)
            likes = extractLikes(infoData, lang)
            comments = extractComments(infoData, lang)
            caption = extractCaption(cur)
            dateTime = extractDateTime(cur)
            commentMessages = extractCommentsMessage(cur)
            writeToFile("data/" + query + "/" + dirName + "/info.txt", [
                "likes: ", likes, "",
                "comments: ", comments, "",
                "caption: ", caption, "",
                "commentMessages: ", commentMessages, "",
                "dateTime: ", dateTime, ""
            ])
            # download image (og:image meta tag holds the direct URL)
            imageUrl = cur.split(
                'meta property="og:image" content="')[1].split('"')[0]
            downloadImage(imageUrl, "data/" + query + "/" + dirName + "/image.jpg")
            # Throttle between posts to avoid rate limiting.
            time.sleep(1)
        print("query " + query + " collecting finish")
        time.sleep(2)
    browser.driver.quit()
    print("FINISH!")
class BITKUB:
    """Minimal client for the Bitkub exchange REST API.

    Official Documentation for Bitkub APIs
    https://github.com/bitkub/bitkub-official-api-docs
    """

    def __init__(self, debug=False):
        # Credentials come from the environment; empty string when unset.
        self._api = 'https://api.bitkub.com/api'
        self._key = os.getenv('BITKUB_API_KEY', '')
        self._secret = os.getenv('BITKUB_API_SECRET', '').encode()
        self._debug = debug
        self._browser = Browser(debug=self._debug)

    ##########
    # --- public api ---

    def ticker(self, sym=''):
        """Get ticker information (all symbols when *sym* is empty)."""
        sym = '' if sym == '' else f'?sym={sym.upper()}'
        payload = {'url': self._api + f'/market/ticker{sym}'}
        return self._resp(self._browser.get(**payload))

    def get_bids_asks(self, sym='THB_BTC', lmt=10):
        """List open (bids/asks) orders.

        :return: {
            'asks': [[rate, amount], [174629, 0.00010107], ...],
            'bids': [[rate, amount], [174629, 0.00010107], ...]
        }
        """
        payload = {
            'url': self._api + f'/market/depth?sym={sym.upper()}&lmt={lmt}'
        }
        return self._resp(self._browser.get(**payload))

    def get_bids(self, sym='THB_BTC', lmt=10):
        """List open buy (bid) orders.

        :return: [[rate, volume, amount], [174629, 17.65, 0.00010107], ...]
        """
        payload = {
            'url': self._api + f'/market/bids?sym={sym.upper()}&lmt={lmt}'
        }
        return self._resp_order(self._resp(self._browser.get(**payload)))

    def get_asks(self, sym='THB_BTC', lmt=10):
        """List open sell (ask) orders.

        :return: [[rate, volume, amount], [175500, 928.14, 0.00528859], ...]
        """
        payload = {
            'url': self._api + f'/market/asks?sym={sym.upper()}&lmt={lmt}'
        }
        return self._resp_order(self._resp(self._browser.get(**payload)))

    ##########
    # --- private api ---

    def balance(self):
        """Get balances info (signed request)."""
        payload = {
            'url': self._api + '/market/balances',
            'headers': self._build_headers(),
            'data': self._build_sign({})
        }
        return self._resp(self._browser.post(**payload))

    def sell(self, **kwargs):
        """Create a sell order.

        :param kwargs: sym, amt, rat, typ
        """
        payload = {
            'url': self._api + '/market/place-ask',
            'headers': self._build_headers(),
            'data': self._build_sign(self._data_rules(**kwargs))
        }
        return self._resp(self._browser.post(**payload))

    def buy(self, **kwargs):
        """Create a buy order.

        :param kwargs: sym, amt, rat, typ
        """
        payload = {
            'url': self._api + '/market/place-bid',
            'headers': self._build_headers(),
            'data': self._build_sign(self._data_rules(**kwargs))
        }
        return self._resp(self._browser.post(**payload))

    ##########
    # utility

    def _build_headers(self):
        """Headers required by every signed (private) endpoint."""
        return {
            'Accept': 'application/json',
            'Content-Type': 'application/json',
            'X-BTK-APIKEY': self._key,
        }

    def _build_sign(self, data):
        """Add timestamp and HMAC-SHA256 signature; return the JSON body."""
        data['ts'] = nonce()
        data['sig'] = hmac.new(self._secret,
                               self._json_encode(data).encode(),
                               hashlib.sha256).hexdigest()
        return self._json_encode(data)

    @staticmethod
    def _data_rules(**kwargs):
        """Normalize order parameters to the formats Bitkub accepts."""
        params = {}
        if 'sym' in kwargs:
            # symbol is upper case
            params['sym'] = kwargs['sym'].upper()
        if 'amt' in kwargs:
            # 0.10000000 is invalid, 0.1 is ok
            params['amt'] = format_float(kwargs['amt'])
        if 'rat' in kwargs:
            # (e.g 1000.00 is invalid, 1000 is ok)
            params['rat'] = format_float(kwargs['rat'])
        if 'typ' in kwargs:
            # limit or market
            params['typ'] = ('market' if kwargs['typ'] == 'market' else 'limit')
        return params

    def _resp(self, resp):
        """Unwrap an API response.

        Fix: parse the JSON body once (the original called resp.json() up to
        three times per response). Returns ``result`` when error == 0, the raw
        body on API error, and raises in debug mode on non-200 status.
        """
        if resp.status_code == 200:
            body = resp.json()
            if body.get('error') == 0:
                return body['result']
            return body
        if self._debug:  # catch error !?
            raise Exception(resp)

    @staticmethod
    def _resp_order(o):
        """Drop idx/timestamp columns: each row becomes [rate, volume, amount]."""
        # idx, timestamp, volume, rate, amount
        return [[r, v, a] for i, t, v, r, a in o]

    @staticmethod
    def _json_encode(data):
        """Canonical JSON (sorted keys, no spaces) used for signing."""
        return json.dumps(data, separators=(',', ':'), sort_keys=True)
class PhpBB(object):
    """Class to interact with a phpBB forum over its web interface.

    All operations drive the regular HTML forms (login, posting, deleting,
    private messages) through the shared Browser/requests session.
    """

    delete_form_id = 'confirm'
    reply_url = 'posting.php?mode=reply&f={f}&t={t}'
    edit_url = 'posting.php?mode=edit&f={f}&p={p}'
    form_id = 'postform'
    private_mess_url = 'ucp.php?i=pm&mode=compose'

    def __init__(self, host):
        """Init object with forum url (host) and Browser object."""
        self.host = host
        try:
            self.browser = Browser()
        except HTTPError as e:
            print(e)
            sys.exit(1)

    def __del__(self):
        """Close the session and delete object."""
        try:
            self.browser.session.close()
        except HTTPError as e:
            print(e)
            sys.exit(1)

    def is_logged(self):
        """Check if logged in (phpBB anonymous user id is 1)."""
        u = self._get_user_id()
        if u != 1:
            print(f"login OK : {str(u)}")
            return True
        else:
            print(f"login failed : {str(u)}")
            return False

    def is_logged_out(self):
        """Check if logged out.

        Fix: the original returned True while *still logged in* and False
        after a successful logout, so logout() reported success on failure.
        """
        u = self._get_user_id()
        if u != 1:
            print(f"Still logged in : {str(u)}")
            return False
        else:
            print(f"Signed out : {str(u)}")
            return True

    def _get_user_id(self):
        """Return the user id from the session cookie (None when absent)."""
        cookies = self.browser.list_cookies()
        for cookie in cookies:
            if re.search(cookie_u_pattern, cookie.name):
                return int(cookie.value)

    def _get_sid(self):
        """Return the session id cookie value (None when absent).

        Fix: the original raised UnboundLocalError when no sid cookie matched.
        """
        sid = None
        cookies = self.browser.list_cookies()
        for cookie in cookies:
            if re.search(cookie_sid_pattern, cookie.name):
                sid = cookie.value
        return sid

    def login(self, username, password):
        """Log in phpBB forum. Returns True on success."""
        try:
            forum_ucp = urljoin(self.host, ucp_url)
            # Seed the payload with the hidden <input> fields of the form.
            payload = self.browser.select_tag(forum_ucp, "input")
            payload['username'] = username
            payload['password'] = password
            # Small delay so phpBB accepts the submission.
            time.sleep(1)
            self.browser.post(forum_ucp, params=login_mode, data=payload)
            return self.is_logged()
        except HTTPError as e:
            print(e)
            return False

    def logout(self):
        """Log out of phpBB forum. Returns True on success."""
        try:
            forum_ucp = urljoin(self.host, ucp_url)
            params = {'mode': 'logout', 'sid': self._get_sid()}
            self.browser.post(forum_ucp, params=params)
            return self.is_logged_out()
        except HTTPError as e:
            print(e)
            return False

    def close(self):
        """Close request session (HTTP connection)."""
        try:
            self.browser.session.close()
        except HTTPError as e:
            print(e)
            sys.exit(1)

    def _get_post_text_area(self, url):
        """Return the text of the message <textarea> on *url* (or None)."""
        try:
            soup = self.browser.get_html(url)
            return soup.find("textarea", id="message").text
        except HTTPError as e:
            print(e)
        except AttributeError as e:
            # find() returned None: no textarea on the page.
            print("Error in _get_post_text_area")
            print(e)

    def _make_delete_confirm(self, url):
        """Build (url, payload) confirming a delete on the confirm form."""
        form = self.browser.get_form(url, self.delete_form_id)
        form['values']['confirm'] = 'Oui'
        url = urljoin(self.host, form['action'])
        payload = form['values']
        return url, payload

    def _make_reply_payload(self, url, message):
        """Build (url, payload) posting *message* through the reply form."""
        form = self.browser.get_form(url, self.form_id)
        form['values']['message'] = message
        del form['values']['icon']
        form['values']['post'] = 'Submit'
        url = urljoin(self.host, form['action'])
        payload = form['values']
        return url, payload

    def _make_add_receiver_payload(self, url, receiver):
        """Build (url, payload) adding *receiver* to a private message."""
        form = self.browser.get_form(url, self.form_id)
        form['values']['username_list'] = receiver
        form['values']['add_to'] = "Ajouter"
        form['values']['addbbcode20'] = 100
        del form['values']['icon']
        url = urljoin(self.host, form['action'])
        payload = form['values']
        return url, payload

    def _make_private_message_payload(self, url, subject, message):
        """Build (url, payload) sending a private message."""
        form = self.browser.get_form(url, self.form_id)
        form['values']['subject'] = subject
        form['values']['message'] = message
        form['values']['addbbcode20'] = 100
        # NOTE(review): hard-coded recipient user id 8435 — confirm intended.
        form['values']['address_list[u][8435]'] = "to"
        form['values']['icon'] = 0
        form['values']['post'] = 'Envoyer'
        url = urljoin(self.host, form['action'])
        payload = form['values']
        return url, payload

    def get_post_text(self, postid):
        """Get text of a post (prints it)."""
        post = Post(postid, self)
        post.get_text()
        print(post.text)

    def get_post_editmode_content(self, forum, post):
        """Get text of a post as seen in edit mode."""
        url = urljoin(self.host, self.edit_url.format(f=forum, p=post))
        return self._get_post_text_area(url)

    def edit_post(self, forum, post, new_message):
        """Edit (modify) a message in a forum."""
        url = urljoin(self.host, self.edit_url.format(f=forum, p=post))
        try:
            form = self.browser.get_form(url, self.form_id)
            form['values']['icon'] = 0
            form['values']['message'] = new_message
            form['values']['post'] = 'Submit'
            form['values']['topic_type'] = '0'
            # wait at least 2 seconds so phpBB let us post
            time.sleep(2)
            payload = form['values']
            self.browser.session.post(url, data=payload)
        except HTTPError as e:
            print(f'\n>>> Error {e.code}: {e.msg}')

    def get_forum_topics(self, f):
        """Retrieve and print all topics in a forum.

        Used in list-forum.py, for example.
        """
        forum = Forum(f, self)
        forum.print_forum_title()
        forum.get_nb_topics()
        topics_list = forum.get_forum_topics()
        forum.print_topics()
        return topics_list

    def get_forum_view_topics(self, f):
        """Return the viewtopic urls of a forum."""
        forum = Forum(f, self)
        forum.print_forum_title()
        return forum.get_forum_viewtopics()

    def get_topic_posts(self, viewtopic, max_count):
        """Collect up to *max_count* posts of a topic, page by page."""
        start = 0
        # topic executes get html, get nb message and get title on creation
        topic = Topic(self, viewtopic)
        topic.print40()
        pageurl = topic.make_topic_page_url(start)
        if topic.nb_messages < max_count:
            max_count = topic.nb_messages
        while start < max_count:
            page = Page(pageurl, self.browser.get_html(pageurl))
            pagelist = page.get_page_posts()
            if not pagelist:
                break
            topic.postlist.extend(pagelist)
            # phpBB paginates 10 posts per page.
            start += 10
            pageurl = topic.make_topic_page_url(start)
        return topic.postlist

    def get_topic_posts_with_url(self, viewtopic, txt, max_count):
        """Collect posts of a topic whose content matches *txt*."""
        start = 0
        topic = Topic(self, viewtopic)
        topic.print40()
        pageurl = topic.make_topic_page_url(start)
        if topic.nb_messages < max_count:
            max_count = topic.nb_messages
        while start < max_count:
            page = Page(pageurl, self.browser.get_html(pageurl))
            pagelist = page.get_page_posts_with_url(txt)
            if not pagelist:
                break
            topic.postlist.extend(pagelist)
            start += 10
            pageurl = topic.make_topic_page_url(start)
        return topic.postlist

    def get_topic_posts_not_done(self, viewtopic, posts_done, max_count):
        """Return list of posts, not already done.

        Args:
            viewtopic (str): viewtopic url
            posts_done (int): number of posts already done
            max_count (int): max number

        Returns:
            tuple: (total message count, PostList of unprocessed posts)
        """
        topic = Topic(self, viewtopic)
        topic.print40()
        if topic.nb_messages > posts_done:
            print(f"{posts_done} sur {topic.nb_messages} "
                  f"messages déjà traités")
            start = posts_done
            pageurl = topic.make_topic_page_url(start)
            if topic.nb_messages < max_count:
                max_count = topic.nb_messages
            while start < max_count:
                page = Page(pageurl, self.browser.get_html(pageurl))
                pagelist = page.get_page_posts()
                if not pagelist:
                    break
                topic.postlist.extend(pagelist)
                start += 10
                pageurl = topic.make_topic_page_url(start)
            return topic.nb_messages, topic.postlist
        else:
            print("already_done")
            return topic.nb_messages, topic.postlist

    def get_user_topic_posts(self, viewtopic, max_count):
        """Collect posts of a topic together with their author info."""
        start = 0
        topic = Topic(self, viewtopic)
        topic.print40()
        pageurl = topic.make_topic_page_url(start)
        if topic.nb_messages < max_count:
            max_count = topic.nb_messages
        while start < max_count:
            page = Page(pageurl, self.browser.get_html(pageurl))
            pagelist = page.get_page_posts_with_user()
            if not pagelist:
                break
            topic.postlist.extend(pagelist)
            start += 10
            pageurl = topic.make_topic_page_url(start)
        return topic.postlist

    def delete_post(self, post):
        """Delete one message. Send proper request."""
        try:
            url_get = post.make_delete_req_url(self.host)
            print("delete : " + url_get)
            url_post, payload = self._make_delete_confirm(url_get)
            time.sleep(1)
            self.browser.session.post(url_post, data=payload)
        except HTTPError as e:
            print(e)
            print("HTTPError with post : " + post.id)
        except TypeError as e2:
            print(e2)

    def delete_post_list(self, post_list):
        """Delete multiple messages (in a list)."""
        for post in post_list:
            self.delete_post(post)

    def get_topic_posts_with_user(self, viewtopic, max_count):
        """Collect posts with author info (page url rebuilt each loop)."""
        start = 0
        topic = Topic(self, viewtopic)
        topic.print40()
        if topic.nb_messages < max_count:
            max_count = topic.nb_messages
        while start < max_count:
            pageurl = topic.make_topic_page_url(start)
            page = Page(pageurl, self.browser.get_html(pageurl))
            pagelist = page.get_page_posts_with_user()
            if not pagelist:
                break
            topic.postlist.extend(pagelist)
            start += 10
        return topic.postlist

    def post_reply(self, forum, topic, message):
        """Send a reply."""
        url = urljoin(self.host, self.reply_url.format(f=forum, t=topic))
        urlrep, payload = self._make_reply_payload(url, message)
        print(urlrep)
        print(payload)
        time.sleep(2)
        self.browser.session.post(urlrep, data=payload)

    def send_private_message(self, receiver, subject, message):
        """Send private message."""
        url = urljoin(self.host, self.private_mess_url)
        urlrep1, payload1 = self._make_add_receiver_payload(url, receiver)
        urlrep2, payload2 = self._make_private_message_payload(
            url, subject, message)
        time.sleep(2)
        # Add receiver
        self.browser.session.post(urlrep1, data=payload1)
        # Send message
        self.browser.session.post(urlrep2, data=payload2)
def verify_page_title(self, title):
    """Assert that *title* occurs within the current browser title."""
    current_title = Browser.title()
    assert title in current_title
class SATANG:
    """Minimal client for the Satang Pro exchange REST API.

    Official Documentation for Satang Pro APIs
    https://docs.satang.pro
    """

    def __init__(self, debug=False):
        # Credentials come from the environment; empty string when unset.
        self._api = 'https://api.satang.pro/api'
        self._uid = os.getenv('SATANG_USER_ID', '')
        self._key = os.getenv('SATANG_API_KEY', '')
        self._secret = os.getenv('SATANG_API_SECRET', '').encode('utf-8')
        self._debug = debug
        self._browser = Browser(debug=self._debug)

    ##########
    # --- public api ---

    def get_bids_asks(self, sym='btc_thb'):
        """List the open order book for *sym*.

        :return: {
            'asks': [[rate, amount], [174629, 0.00010107], ...],
            'bids': [[rate, amount], [174629, 0.00010107], ...]
        }
        """
        payload = {'url': self._api + f'/orders/?pair={sym.lower()}'}
        return self._resp_order(self._resp(self._browser.get(**payload)))

    ##########
    # --- private api ---

    def user(self):
        """Get information about the configured user id."""
        payload = {
            'url': self._api + f'/users/:{self._uid}',
            'headers': self._build_headers()
        }
        return self._resp(self._browser.get(**payload))

    def buy(self, pair, price, amount, typ='limit'):
        """Create a buy order (*typ* is 'limit' or 'market').

        Fix: removed a stray debug print of the order data.
        """
        data = {
            'pair': pair.lower(),
            'price': price,
            'amount': amount,
            'side': 'buy',
            'type': ('limit' if typ == 'limit' else 'market'),
            'nonce': nonce()
        }
        return self._create_orders(**data)

    def sell(self, pair, price, amount, typ='limit'):
        """Create a sell order (*typ* is 'limit' or 'market')."""
        data = {
            'pair': pair.lower(),
            'price': price,
            'amount': amount,
            'side': 'sell',
            'type': ('limit' if typ == 'limit' else 'market'),
            'nonce': nonce()
        }
        return self._create_orders(**data)

    def _create_orders(self, **kwargs):
        """POST a signed order; the signature covers the query-string form."""
        data = self._concatenate_params(**kwargs)
        payload = {
            'url': self._api + '/orders/',
            'headers': self._build_headers(data),
            'data': data
        }
        return self._resp(self._browser.post(**payload))

    ##########
    # utility

    def _build_headers(self, s=''):
        """Build auth headers; *s* is the payload string being signed."""
        return {
            'Authorization': 'TDAX-API ' + self._key,
            'Signature': hmac.new(self._secret, s.encode('utf-8'),
                                  hashlib.sha512).hexdigest(),
        }

    def _resp(self, resp):
        """Return the JSON body on HTTP 200; raise in debug mode otherwise."""
        if resp.status_code == 200:
            return resp.json()
        if self._debug:  # catch error !?
            raise Exception(resp)

    @staticmethod
    def _resp_order(o):
        """Reshape the raw order book into {'bids': ..., 'asks': ...}."""
        return {
            'bids': [[_['price'], _['amount']] for _ in o['bid']],
            'asks': [[_['price'], _['amount']] for _ in o['ask']]
        }

    @staticmethod
    def _concatenate_params(**p):
        """Join params as 'k=v' pairs, '&'-separated, sorted by key.

        Fix: removed a stray debug print of the raw params.
        """
        return '&'.join(sorted(['{}={}'.format(_, p[_]) for _ in p])) if p else ''
def setPath(self):
    """Define current path with Browser (utils)."""
    browser_helper = Browser()
    current_path = browser_helper.get()
    self.get('dirname').set(current_path)
def find_elements(self, xpath):
    """Return every element matching *xpath* via the shared driver."""
    driver = Browser.get_driver()
    return driver.find_elements_by_xpath(xpath)
class Bovespa(object):
    """
    Class responsible to manage the operations with the website of bovespa.
    """

    def __init__(self):
        self.__browser = Browser()
        self.bovespa_url_base = 'http://bvmf.bmfbovespa.com.br'

    @staticmethod
    def _files_from_period(files_list, last_update, current_date=None):
        """
        From all the files available in the website, selects only the ones
        between the range date (last update, current date).

        :param files_list: Dict of the files available on the Bovespa website
        :param last_update: Last time the update was made
        :param current_date: Current time (normally today). Fix: the original
            default ``dt.today()`` was evaluated once at import time; None now
            means "today at call time".
        :return: list of files to be download
        """
        if current_date is None:
            current_date = dt.today()
        result = []
        file_avaliable = None
        while last_update.date() <= current_date.date():
            day = last_update.day
            month = last_update.month
            # Zero-pad day/month to match the COTAHIST file-name format.
            if day < 10:
                day = '0{0}'.format(last_update.day)
            if month < 10:
                month = '0{0}'.format(last_update.month)
            if last_update.year != current_date.year:
                # Past years: one yearly file, then jump to Jan 1st next year.
                # Fix: the literals were written as 01, a SyntaxError on
                # Python 3 (Python 2 octal); 1 is the intended value.
                file_avaliable = files_list.get(
                    'COTAHIST_A{year}.ZIP'.format(year=last_update.year))
                last_update += relativedelta(years=1)
                last_update = last_update.replace(day=1)
                last_update = last_update.replace(month=1)
            elif (last_update.year == current_date.year) and (last_update.month != current_date.month):
                # Current year, earlier months: monthly files.
                file_avaliable = files_list.get(
                    'COTAHIST_M{month}{year}.ZIP'.format(month=month, year=last_update.year))
                last_update += relativedelta(months=1)
            elif (last_update.year == current_date.year) and (last_update.month == current_date.month):
                # Current month: daily files.
                file_avaliable = files_list.get('COTAHIST_D{day}{month}{year}.ZIP'.format(
                    day=day, month=month, year=last_update.year))
                last_update += td(days=1)
            if file_avaliable:
                result.append(file_avaliable)
        return result

    def _available_files(self):
        """
        Get all the file names available to download on the bovespa website.

        :return: List of available files in the website
        """
        url = self.bovespa_url_base + '/pt-br/cotacoes-historicas/FormSeriesHistoricasArq.asp'
        page = self.__browser.get_page(url)
        files = bovespa_parser.parse_files_form(page)
        return files

    def select_files(self, start_dt, finish_dt=None):
        """
        Select the files to be download to update the database.

        :type start_dt: datetime.datetime
        :type finish_dt: datetime.datetime; None (fix: was a def-time
            ``dt.now()`` default) means "now at call time"
        :return: List of files to download based on the start and finish date
        """
        if finish_dt is None:
            finish_dt = dt.now()
        available_files = self._available_files()
        return self._files_from_period(available_files, start_dt, finish_dt)

    def download_file(self, file_name):
        """
        Download a file from bovespa historic website, and returns
        uncompressed.

        :type file_name: str
        :return: A TXT file.
        """
        url = self.bovespa_url_base + '/InstDados/SerHist/'
        downloaded_file = self.__browser.get_page(url + file_name)
        compressed_file = StringIO(downloaded_file)
        uncompressed_file = uncompress_zipfile(compressed_file)
        return uncompressed_file
def __init__(self, platform):
    """Create the helpers shared by every platform scraper."""
    self.browser = Browser()
    self.config = Config(platform)
    self.email_alert = EmailAlert()
    # Expose the locator helper so subclasses can write self.By.XPATH etc.
    # (presumably selenium's By — TODO confirm against the import site)
    self.By = By
def runCrawl(limitNum=0, queryList=None, is_all_comments=False, userinfo=None):
    """Crawl Instagram posts for each query, optionally logged in.

    For every query (a "#hashtag" or a username) this collects post URLs by
    scrolling, then saves raw HTML, extracted info and the post image under
    data/<query>/<post>/.

    :param limitNum: scroll/collection limit passed to the browser helper
    :param queryList: hashtags ("#tag") or usernames to crawl
    :param is_all_comments: expand all comments before scraping each post
    :param userinfo: login credentials dict; empty/None crawls anonymously
    """
    # Fix: the defaults were shared mutable objects ([] / {}); use None
    # sentinels instead.
    if queryList is None:
        queryList = []
    if userinfo is None:
        userinfo = {}
    browser = Browser("driver/chromedriver")
    if userinfo != {}:
        print('Start logging in')
        browser.goToPage('https://www.instagram.com/accounts/login/?hl=en')
        if browser.log_in(userinfo):
            print('Success to log in')
        else:
            print('Fail to log in')
            return
    else:
        print('Continue Without logging in')
    for query in queryList:
        browser.clearLink()
        makeDir("data")
        makeDir("data/" + query)
        # Hashtags use the explore/tags URL, usernames the profile URL.
        if query[0] == "#":
            mUrl = "https://www.instagram.com/explore/tags/" + query[1:] + "/?hl=en"
        else:
            mUrl = "https://www.instagram.com/" + query + "/?hl=en"
        browser.goToPage(mUrl)
        print("collecting url of " + query + "...")
        browser.scrollPageToBottomUntilEnd(browser.collectDpageUrl, limitNum)
        print("finish scoll collecting!")
        print("collecting data...")
        # Deduplicate the collected post URLs.
        slist = list(set(browser.urlList))
        for url in tqdm(slist):
            dirName = url.split("/")[4]
            # skip if already crawled (makeDir returns falsy for existing dirs)
            if not makeDir("data/" + query + "/" + dirName):
                continue
            browser.goToPage(url)
            if is_all_comments:
                browser.expandComments()
            cur = browser.getPageSource()
            writeToFile("data/" + query + "/" + dirName + "/raw.html", [cur])
            infoData = BeautifulSoup(cur, "lxml")
            imageData = infoData.find("img", class_="FFVAD")
            # extract data
            likes = extractLikes(infoData)
            comments_list = extractComments(infoData)
            # Fix: was comments_list.__len__(); call len() instead.
            comments = len(comments_list)
            caption = extractCaption(imageData)
            dateTime = extractDateTime(infoData)
            commentMessages = extractCommentsMessage(comments_list)
            writeToFile("data/" + query + "/" + dirName + "/info.txt", [
                "likes: ", likes, "",
                "comments: ", comments, "",
                "caption: ", caption, "",
                "commentMessages: ", commentMessages, "",
                "dateTime: ", dateTime, ""
            ])
            # download image
            imageUrl = imageData.get("srcset")
            downloadImage(imageUrl, "data/" + query + "/" + dirName + "/image.jpg")
            # Throttle between posts to avoid rate limiting.
            time.sleep(1)
        print("query " + query + " collecting finish")
        time.sleep(2)
    browser.driver.quit()
    print("FINISH!")
def go_to_url(self, url):
    """Navigate the shared browser to *url*."""
    Browser.go_to(url)
def __init__(self):
    """Grab the shared WebDriver instance from the Browser helper."""
    self.driver = Browser.getDriver()
def __init__(self):
    """Remember the Bovespa base URL and create the HTTP helper."""
    self.bovespa_url_base = 'http://bvmf.bmfbovespa.com.br'
    self.__browser = Browser()
def teardown_function(function):
    """pytest function-level teardown: close the shared browser."""
    Browser.quit()
def wait_until_visibile(self, locator):
    """Block until the element at *locator* becomes visible.

    NOTE: the name's "visibile" typo is kept — it is the public API.
    """
    waiter = Browser.wait()
    waiter.until(EC.visibility_of_element_located(locator))