def fetch_course_soups(course_list: List[Dict[str, str]], cookies: Dict[str, str]) -> Dict[str, BeautifulSoup]:
    course_soups: Dict[str, BeautifulSoup] = {}
    for course in course_list:
        course_name = course["title"]
        href = course["href"]
        queries = parse_qs(urlparse(href).query)
        assert "id" in queries
        course_id = queries["id"][0]
        # res = requests.get(f"https://letus.ed.tus.ac.jp/course/info.php?id={course_id}")
        # syllabus_soup = BeautifulSoup(res.text, "html.parser")
        res = requests.get(href, cookies=cookies)
        assert res.status_code == 200
        soup = BeautifulSoup(res.text, "html.parser")
        if __debug__:
            # cache the raw page for offline inspection
            page_cache_dir = Path(hydra.utils.get_original_cwd()) / ".cache" / "pages"
            os.makedirs(page_cache_dir, exist_ok=True)
            save_html(res, page_cache_dir / f"{course_name}.html", "w")
        course_soups[course_name] = soup
    return course_soups
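# The function above relies on a save_html(res, path, mode) helper that is not
# shown (other snippets in this collection call save_html variants with
# different signatures). A minimal sketch of what it presumably does here,
# assuming it simply dumps the response body to disk (hypothetical
# implementation; only the call signature is taken from the code above):
def save_html(res: requests.Response, path: Path, mode: str = "w") -> None:
    # Cache the fetched page so it can be inspected offline while debugging.
    with open(path, mode, encoding="utf-8") as f:
        f.write(res.text)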
def bp_preprocessor(self):
    html = read(self.url, self.local)
    # Save the article locally
    if self.save and not self.local:
        title = self.url.replace("https://", "")
        title = title.replace("/", ":")
        save_html(html, "html_pages/" + title)
    soup = BeautifulSoup(html, "html.parser")
    r = soup.find('div', {"class": "entry_content"})
    return r
def update_internal_server_state(self, previous_page):
    tree = html.fromstring(previous_page.content)
    forms = tree.xpath("//form[@id='FormBookingCreate']")
    if len(forms) and forms[0].attrib['action'].endswith('/4'):
        save_html(previous_page.content)
        self.log_step('✅ Step 0 and 3: Accepted conditions and RDV type chosen')
        return
    next_button = tree.xpath("//input[@name='nextButton']")
    if next_button[0].value != "Etape suivante":
        save_html(previous_page.content)
        raise Exception("Step 0: Dates not available :(")
    self.log_step('✅ Step 0: Accepted conditions')
    self.log_step('Step 3: Submitting form and implicitly choosing RDV type')
    page = self.post(f"{self.url_base}/3", {'nextButton': 'Etape suivante'})
    tree = html.fromstring(page.content)
    etape4_active = tree.xpath("//img[contains(@src, '/etape_4r.png')]")
    if not len(etape4_active):
        save_html(page.content)
        raise Exception("Step 3: Dates not available :(")
    self.log_step('✅ Step 3: Submitted')
    save_html(page.content)
def choose_first_available(self):
    self.log_step('Step 4: Choosing the first timeslot available')
    page = self.post(f"{self.url_base}/4",
                     {'nextButton': 'Première+plage+horaire+libre'})
    tree = html.fromstring(page.content)
    etape6_active = tree.xpath("//img[contains(@src, '/etape_6r.png')]")
    if not len(etape6_active):
        save_html(page.content)
        raise Exception("Step 4: Dates not available :(")
    save_html(page.content)
    date_time = tree.xpath("//*[@id='inner_Booking']/fieldset")[0].text_content()
    self.log_step('\n'.join(['✅ Step 4: Chosen date', f'```{date_time}```']))
def accept_conditions(self):
    self.logger.info('Step 0: Validating conditions')
    page = self.post(self.url_start, {
        'condition': 'on',
        'nextButton': 'Effectuer+une+demande+de+rendez-vous',
    })
    if not page:
        raise Exception('Conditions not accepted. Bad request')
    tree = html.fromstring(page.content)
    next_button = tree.xpath("//input[@name='nextButton']")
    if not len(next_button):
        save_html(page.content)
        raise Exception("Step 0: Next button not found")
    return page
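# accept_conditions, update_internal_server_state and choose_first_available
# are consecutive steps of the same booking flow. A hedged sketch of how a
# caller might chain them (`bot` is assumed to be an instance of the class
# defining these methods; retries and captcha handling are omitted):
def run_booking_flow(bot):
    page = bot.accept_conditions()           # step 0: validate conditions
    bot.update_internal_server_state(page)   # steps 0/3: choose the RDV type
    bot.choose_first_available()             # step 4: first free timeslot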
def load_all_courses(driver, html_name, num_load=None):
    utils.open_url(driver, host + '/library/search')
    # switch to Course tab
    for elem in driver.find_elements_by_xpath('//li[@class="tab-list__item"]'):
        if elem.text == 'Courses':
            elem.click()
    # define target scraping section
    course_section = driver.find_element_by_xpath('//div[@aria-selected="true"]')
    # expected number of courses
    ncourse_expect = int(
        course_section.find_element_by_xpath(
            './/*[@class="l-search__results-page-info"]').text.split()[1])
    nload = 0
    if num_load:
        nload_max = num_load
    else:
        # 25 courses per load, plus a small safety margin
        nload_max = (ncourse_expect // 25) + 3
    while nload < nload_max:
        courses = course_section.find_elements_by_xpath(
            './/li[@class="courses-list__item"]')
        ncourses = len(courses)
        utils.print_message('#load={}, ncourses={}'.format(nload, ncourses))
        nload += 1
        buttons = course_section.find_elements_by_xpath(
            './/a[@class="button button--outlined"]')
        if len(buttons) == 0:
            break
        buttons[0].click()
        utils.wait(3)
    # save html
    utils.save_html(driver, html_name)
    course_list = course_section.find_elements_by_xpath(
        './/li[@class="courses-list__item"]')
    utils.print_message('expect {} courses, loaded {}.'.format(
        ncourse_expect, len(course_list)))
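# A hedged usage sketch for load_all_courses (driver construction is
# illustrative; any login step the site requires is omitted):
def dump_courses(html_name='courses.html'):
    from selenium import webdriver
    driver = webdriver.Chrome()
    load_all_courses(driver, html_name)
    driver.quit()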
def req(self, method, url, max_retries=1, cookies=None, headers=None,
        data=None, first_attempt_with_proxy=True):
    headers = headers or {}  # avoid a mutable default argument
    proxy_url = self.get_next_proxy_url() if first_attempt_with_proxy else None
    last_response = None
    for attempt in range(max_retries):
        attempt_text = f'Attempt {attempt + 1}:'
        try:
            last_response = requests.request(
                method,
                url,
                cookies=cookies,
                headers={
                    **headers,
                    **HttpClient.DEFAULT_HEADERS,
                },
                data=data,
                timeout=HttpClient.REQ_TIMEOUT,
                proxies={'http': proxy_url, 'https': proxy_url} if proxy_url else None)
            if not last_response.ok:
                self.logger.warning(f"{attempt_text} Failed with status code {last_response.status_code}")
                if last_response.status_code == 403:
                    # delist the faulty proxy
                    self.logger.warning(f"{attempt_text} Delisting faulty proxy {proxy_url}")
                    self.proxies_queue.popleft()
                    self.proxies.remove(proxy_url)
                    with open(self.proxies_path, 'w') as f:
                        # persist the remaining proxies, one per line
                        f.write('\n'.join(self.proxies))
                elif last_response.status_code == 502:
                    self.logger.warning(f"{attempt_text} 502 - Bad Gateway")
                    time.sleep(3)
                else:
                    save_html(last_response.content)
                if attempt < max_retries - 1:
                    # set a new proxy for the next attempt
                    proxy_url = self.get_next_proxy_url()
            else:
                return last_response
        except Exception as ex:
            if type(ex) in [ReadTimeout, ProxyError]:
                self.logger.warning(f'{attempt_text} Timeout')
            else:
                self.logger.warning(f"{attempt_text} Failed with exception:")
                self.logger.exception(ex)
            time.sleep(attempt + 1)
    return last_response
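# Example use of req: fetch a URL with up to three attempts, rotating the
# proxy between failed attempts. `client` is assumed to be a configured
# HttpClient instance; the return value may be None if every attempt raised.
def fetch_with_retries(client, url):
    response = client.req('GET', url, max_retries=3)
    if response is not None and response.ok:
        return response.content
    return None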
def book_date(self, g_captcha_response, form_data):
    page = self.post(f"{self.url_base}/6", {
        'g-recaptcha-response': g_captcha_response,
        'nextButton': 'Etape+suivante'
    })
    tree = html.fromstring(page.content)
    next_button = tree.xpath("//input[@name='nextButton']")
    if not len(next_button):
        save_html(page.content)
        raise Exception('☠️ Step 6: Next button not found')
    if next_button[0].value != "Etape suivante":
        save_html(page.content)
        raise Exception("☠️ Step 6: Dates not available :(")
    self.log_step('✅ Step 6: Anticaptcha accepted')
    user_email = form_data['email']
    self.log_step(f'Step 8: Submitting form for `{user_email}`')
    page = self.post(f"{self.url_base}/8", {
        **form_data,
        'nextButton': 'Etape+suivante'
    })
    tree = html.fromstring(page.content)
    message_sent = tree.xpath(
        "//li[contains(text(), 'Vous disposez de') and contains(text(), 'minutes pour confirmer')]")
    if not len(message_sent):
        save_html(page.content)
        self.log_step('☠️ Step 8: Not submitted :(')
        raise Exception('☠️ Step 8: Message not sent')
    self.log_step(f'✅ Step 8: Submitted. Check email `{user_email}`')
def topics_up(self, topics_list, content=None):
    '''
    Randomly select a reply text and bump each topic in topics_list.
    '''
    if content is None:  # avoid a mutable default argument
        content = ['顶', '顶帖', '自己顶', 'waiting']  # "bump"-style reply texts
    if not self.auth.ck:
        logger.error('ck is invalid!')
        return False
    # For example --> topics_list = ['22836371', '98569169']
    for topics_id in topics_list:
        post_data = {
            "ck": self.auth.ck,
            "rv_comment": random.choice(content),
            "img": "(binary)",
            "start": "0",
            "submit_btn": "发送",  # "send"
        }
        url = DOUBAN_ADD_COMMENT.format(topic_id=topics_id)
        # call need_captcha once and reuse the result
        captcha_id, captcha_url = need_captcha(self.auth, url + "/?start=0")
        if captcha_id:
            logger.info("The captcha_image url address is %s" % captcha_url)
            vcode = input('图片上的验证码是:')  # prompt: "the captcha in the image is:"
            post_data["captcha-solution"] = vcode
            post_data["captcha-id"] = captcha_id
        r = self.auth.session.post(url, post_data,
                                   cookies=self.auth.session.cookies.get_dict())
        if DEBUG:
            save_html("topics_up.html", r.text)
        if r.status_code == 200:
            logger.info('Okay, already up ' + topics_id + ' topic')
        logger.info(r.status_code)
        logger.info(str(topics_list.index(topics_id) + 1).join(['Waiting for ', ' ...']))
        time.sleep(20)  # wait 20 seconds before bumping the next topic; adjust for a longer delay
    return True
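# topics_up assumes need_captcha(auth, url) returns a (captcha_id, captcha_url)
# pair with captcha_id falsy when no captcha is required. Example call, using
# the topic ids from the comment above (`client` is an instance of this class):
# client.topics_up(['22836371', '98569169'])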
def fetch_page(self, y, m, d):
    """
    Constructs an url and checks if it contains a page.
    :param y: year
    :param m: month
    :param d: day
    :return: a list of dicts { 'url': url, 'title': title, 'date': date object }
    """
    url = self.baseurl + "/" + dts(y) + "/" + dts(m) + "/" + dts(d) + "/"
    response = read(url, self.local)
    if response == "empty":
        return response
    logger.write_log("visiting: " + url)
    title = url.replace("https://", "")
    title = title.replace("/", ":")
    # Save the article locally
    if self.save and not self.local:
        save_html(response, "html_collection_pages/" + title)
    articles = self.fetch_articles(response)
    for article in articles:
        article['date'] = dt.date(y, m, d)
    return articles
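# A hedged driver sketch for fetch_page: walk every day of a month and collect
# the articles. "empty" is the sentinel fetch_page returns for days without a
# page; `scraper` is assumed to be an instance of the class defining it.
import calendar

def fetch_month(scraper, y, m):
    all_articles = []
    for d in range(1, calendar.monthrange(y, m)[1] + 1):
        result = scraper.fetch_page(y, m, d)
        if result != "empty":
            all_articles.extend(result)
    return all_articles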
def check_planning_dates(self, planning_id, planning_title):
    self.logger.debug(f'Step 1: Checking planning {planning_id}')
    page = self.post(f"{self.url_base}/1", {
        'planning': str(planning_id),
        'nextButton': 'Etape suivante',
    }, first_attempt_with_proxy=True)
    if not page:
        return page
    tree = html.fromstring(page.content)
    etape3_active = tree.xpath("//img[contains(@src, '/etape_3r.png')]")
    etape4_active = tree.xpath("//img[contains(@src, '/etape_4r.png')]")
    if len(etape3_active) or len(etape4_active):
        self.log_step(f'✅ Step 1: Dates available for "{planning_title}"')
        save_html(page.content)
        return page
    finish_button = tree.xpath("//input[@name='finishButton']")
    # guard against an empty result so the anomaly branch below is reachable
    if len(finish_button) and finish_button[0].value == "Terminer":
        self.logger.debug(f'Step 1: No dates for {planning_title}')
        return page
    self.log_step(f'❓ Step 1: Anomaly detected for {planning_title}. Dumping html.')
    save_html(page.content)
    return page
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

import os
import shutil

import utils

SOURCE_PATH = 'test/'
BUILD_PATH = 'build/'

if os.path.lexists(BUILD_PATH):
    shutil.rmtree(BUILD_PATH)
shutil.copytree(SOURCE_PATH, BUILD_PATH)

md_files = utils.get_files_tree(BUILD_PATH, 'md')
utils.save_html.setup(html_css='markdown.css')
for item in md_files:
    dest = item[:-2] + 'html'
    print(item, '==>', dest)
    with open(item, 'rb') as fp:
        model = utils.parse_md(fp)
    with open(dest, 'wb') as res_fp:
        utils.save_html(res_fp, model)
    os.remove(item)
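# utils.save_html is used above both as a callable and through a setup()
# attribute. A minimal sketch of that function-attribute pattern
# (hypothetical; only the two call sites above come from the script, and the
# rendering of the parsed model is elided to a str() call):
def save_html(fp, model):
    css = getattr(save_html, 'html_css', None)
    head = '<link rel="stylesheet" href="%s">' % css if css else ''
    fp.write(('<html><head>%s</head><body>%s</body></html>'
              % (head, model)).encode('utf-8'))

def setup(**options):
    for key, value in options.items():
        setattr(save_html, key, value)
save_html.setup = setup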
def check_once(self):
    unread_msgs = self.service.users().messages().list(
        userId='me',
        labelIds=['UNREAD', 'INBOX'],
        q='subject:"Demande de rendez-vous en attente de confirmation"'
    ).execute()
    if unread_msgs['resultSizeEstimate'] <= 0:
        self.logger.info('No new messages')
        return
    self.log_step(f'Step 9: Found {len(unread_msgs["messages"])} new messages')
    for msg in unread_msgs['messages']:
        try:
            msg_raw = self.service.users().messages().get(
                userId='me', id=msg['id'], format="raw",
                metadataHeaders=None).execute()
            msg = message_from_bytes(base64.urlsafe_b64decode(msg_raw['raw']))
            msg_payload = msg.get_payload()
            msg_html = html.fromstring(msg_payload)
            confirm_link = msg_html.xpath('//a[contains(@href, "booking/confirm")]')
            confirm_url = confirm_link[0].attrib['href']
            self.log_step(f'Step 10: Following confirmation url: `{confirm_url}`')
            page = requests.get(confirm_url, headers=HttpClient.DEFAULT_HEADERS)
            if not page:
                save_html(page.content)
                raise Exception(
                    f'☠️ Step 10: Url was not loaded. Status code: {page.status_code}')
            self.log_step(f'✅ Step 10: Confirmation page loaded `{confirm_url}`')
            tree = html.fromstring(page.content)
            confirm_form = tree.xpath("//*[@id='inner_Booking']/form")
            if not len(confirm_form):
                save_html(page.content)
                raise Exception('☠️ Confirmation form not found')
            confirm_url = f"{self.config.url}{confirm_form[0].attrib['action']}"
            confirm_btn = tree.xpath("//input[@name='createButton']")
            if not len(confirm_btn):
                save_html(page.content)
                raise Exception('☠️ Confirmation button not found')
            self.log_step(f'Step 11: Submitting confirmation to: `{confirm_url}`')
            page = requests.post(
                confirm_url,
                headers={
                    **HttpClient.DEFAULT_HEADERS,
                    'referer': confirm_url,
                    'origin': confirm_url
                },
                cookies=page.cookies,
                data={'createButton': confirm_btn[0].value})
            save_html(page.content)
            if not page:
                raise Exception(
                    f'☠️ Step 11: Confirmation not submitted. Status code: {page.status_code}')
            self.log_step('✅ RDV fully confirmed!')
        except Exception as ex:
            self.log_step(f'☠️ Exception: {ex}')
            self.logger.exception(ex)
def process_html(
    html: str,
    store_url: str,
    html_path: str,
    full_html_path: str,
    json_path: str,
    title: Optional[str] = None,
    image: Optional[str] = None,
) -> Optional[str]:
    try:
        soup = BeautifulSoup(html, "lxml")
        data = {"path": html_path}
        if image is None:
            # Let's see if the header has img metadata
            og_img = soup.find("meta", attrs={"property": "og:image"})
            twitter_img = soup.find("meta", attrs={"name": "twitter:image"})
            if og_img is None and twitter_img is None:
                # Header has no img metadata; fall back to a raw string search.
                og_img_idx = html.find("og:image")
                if og_img_idx != -1:
                    idx_s = og_img_idx + 11
                    idx_e = html.find('"', idx_s)
                    if idx_e == -1:
                        idx_e = html.find("'", idx_s)
                    image = html[idx_s:idx_e]
                else:
                    twitter_img_idx = html.find('name="twitter:image"')
                    if twitter_img_idx != -1:
                        # index from twitter_img_idx here; og_img_idx is -1 in this branch
                        idx_s = twitter_img_idx + 16
                        idx_e = html.find('"', idx_s)
                        if idx_e == -1:
                            idx_e = html.find("'", idx_s)
                        image = html[idx_s:idx_e]
                    else:
                        return "no_image_metadata"
            elif og_img is not None:
                image = og_img.attrs["content"]
            else:
                image = twitter_img.attrs["content"]
        data["image"] = image
        description = soup.find("meta", attrs={"property": "og:description"})
        if description is None:
            description = soup.find("meta", attrs={"name": "twitter:description"})
        if description is None:
            description = soup.find("meta", attrs={"name": "description"})
        if description is not None:
            data["description"] = description.attrs["content"].strip().replace("\n", " ")
        # soup.find(...) returns None for a missing tag, so guard each lookup
        # instead of chaining .attrs onto a possible None
        found_title = None
        og_title = soup.find("meta", attrs={"property": "og:title"})
        if og_title is not None:
            found_title = og_title.attrs.get("content")
        if found_title is None:
            twitter_title = soup.find("meta", attrs={"name": "twitter:title"})
            if twitter_title is not None:
                found_title = twitter_title.attrs.get("content")
        if found_title is None:
            title_tag = soup.find("title")
            if title_tag is not None:
                found_title = title_tag.text
        if found_title is None and title is None:
            return "no_title_metadata"
        if found_title is not None:
            title = found_title
        final_html_with_body = process_tree(html)
        title_arr = typed.List(title.split())
        extracted_text, unbranded_title = extract_all_text(final_html_with_body, title_arr)
        final_html = get_rid_of_the_body(final_html_with_body)
        data["title"] = unbranded_title if unbranded_title else title
        data["extracted_text"] = extracted_text
        data["store_url"] = store_url
        asyncio.run(save_json(json_path, data))
        asyncio.run(save_html(html_path, final_html))
        asyncio.run(save_html(full_html_path, html))
        print(f"Finished processing {html_path}.")
        return None
    except Exception:
        print(f"Failed at processing {html_path}.")
        return None
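# Illustrative driver for process_html (paths are hypothetical). A string
# return value ("no_image_metadata", "no_title_metadata") signals a
# recoverable skip, while None is returned both on success and on a caught
# processing failure, which the function itself logs via print:
def process_file(path: str, store_url: str) -> None:
    with open(path, encoding="utf-8") as f:
        raw_html = f.read()
    reason = process_html(
        html=raw_html,
        store_url=store_url,
        html_path=path + ".clean.html",
        full_html_path=path + ".full.html",
        json_path=path + ".json",
    )
    if reason is not None:
        print(f"Skipped {path}: {reason}")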
def main(_args):
    agent_type = _args.agent
    results_path = _args.results_path + f"_{agent_type}"
    results = []
    environment_settings = {}
    best_agent, best_eval_score = None, float('-inf')

    if agent_type == "reinforce":
        from scripts.agent_reinforce import REINFORCE
        AGENT = REINFORCE
        environment_settings = {
            'env_name': 'CartPole-v1',
            'gamma': 1.00,
            'max_minutes': 10,
            'max_episodes': 10000,
            'goal_mean_100_reward': 475
        }
        inner_wrappers = None
    elif agent_type == "vpg":
        from scripts.agent_vpg import VPG
        AGENT = VPG
        environment_settings = {
            'env_name': 'CartPole-v1',
            'gamma': 1.00,
            'max_minutes': 10,
            'max_episodes': 10000,
            'goal_mean_100_reward': 475
        }
        inner_wrappers = None
    elif agent_type == "sac":
        from scripts.agent_sac import SAC
        AGENT = SAC
        environment_settings = {
            'env_name': 'HalfCheetah-v2',
            'gamma': 0.99,
            'max_minutes': 300,
            'max_episodes': 10000,
            'goal_mean_100_reward': 2000
        }
        inner_wrappers = [utils.RenderUint8]
    else:
        raise NotImplementedError("Other agent types are not supported yet")

    utils.create_directory(results_path)
    for seed in SEEDS:
        env_name, gamma, max_minutes, \
            max_episodes, goal_mean_100_reward = environment_settings.values()
        agent = AGENT()
        make_env_fn, make_env_kargs = utils.get_make_env_fn(
            env_name=env_name, inner_wrappers=inner_wrappers)
        result, final_eval_score, training_time, wallclock_time = agent.train(
            make_env_fn, make_env_kargs, seed, gamma, max_minutes,
            max_episodes, goal_mean_100_reward)
        results.append(result)
        if final_eval_score > best_eval_score:
            best_eval_score = final_eval_score
            best_agent = agent
    results = np.array(results)  # dim: (num_seeds, max_episode, 5)
    _ = BEEP()

    # Agent Progression
    html_data, title = best_agent.demo_progression()
    utils.save_html(data=html_data,
                    path=os.path.join(results_path, f"{title}.html"))
    # Best Agent
    html_data, title = best_agent.demo_last()
    utils.save_html(data=html_data,
                    path=os.path.join(results_path, f"{title}.html"))

    # Extracting statistics
    """
    Over multiple seeds:
        total_steps, mean_100_reward, mean_100_eval_score,
        training_time, wall_clock_elapsed
    result is initialized with nan values and with fixed max episode length;
    so, mean/max/min will calculate until common episode length
    (min episode length for different seeds)
    """
    agent_max_t, agent_max_r, agent_max_s, \
        agent_max_sec, agent_max_rt = np.max(results, axis=0).T
    agent_min_t, agent_min_r, agent_min_s, \
        agent_min_sec, agent_min_rt = np.min(results, axis=0).T
    agent_mean_t, agent_mean_r, agent_mean_s, \
        agent_mean_sec, agent_mean_rt = np.mean(results, axis=0).T
    agent_x = np.arange(len(agent_mean_s))  # x axis values (episode numbers): 0, 1, 2, ...
    # Plot Statistics
    fig, axs = plt.subplots(5, 1, figsize=(20, 30), sharey=False, sharex=True)
    axs[0].plot(agent_max_r, 'b', linewidth=1)
    axs[0].plot(agent_min_r, 'b', linewidth=1)
    axs[0].plot(agent_mean_r, 'b', label=agent_type.upper(), linewidth=2)
    axs[0].fill_between(agent_x, agent_min_r, agent_max_r, facecolor='b', alpha=0.3)
    axs[1].plot(agent_max_s, 'b', linewidth=1)
    axs[1].plot(agent_min_s, 'b', linewidth=1)
    axs[1].plot(agent_mean_s, 'b', label=agent_type.upper(), linewidth=2)
    axs[1].fill_between(agent_x, agent_min_s, agent_max_s, facecolor='b', alpha=0.3)
    axs[2].plot(agent_max_t, 'b', linewidth=1)
    axs[2].plot(agent_min_t, 'b', linewidth=1)
    axs[2].plot(agent_mean_t, 'b', label=agent_type.upper(), linewidth=2)
    axs[2].fill_between(agent_x, agent_min_t, agent_max_t, facecolor='b', alpha=0.3)
    axs[3].plot(agent_max_sec, 'b', linewidth=1)
    axs[3].plot(agent_min_sec, 'b', linewidth=1)
    axs[3].plot(agent_mean_sec, 'b', label=agent_type.upper(), linewidth=2)
    axs[3].fill_between(agent_x, agent_min_sec, agent_max_sec, facecolor='b', alpha=0.3)
    axs[4].plot(agent_max_rt, 'b', linewidth=1)
    axs[4].plot(agent_min_rt, 'b', linewidth=1)
    axs[4].plot(agent_mean_rt, 'b', label=agent_type.upper(), linewidth=2)
    axs[4].fill_between(agent_x, agent_min_rt, agent_max_rt, facecolor='b', alpha=0.3)

    # ALL
    axs[0].set_title('Moving Avg Reward (Training)')
    axs[1].set_title('Moving Avg Reward (Evaluation)')
    axs[2].set_title('Total Steps')
    axs[3].set_title('Training Time')
    axs[4].set_title('Wall-clock Time')
    plt.xlabel('Episodes')
    axs[0].legend(loc='upper left')
    fig.savefig(os.path.join(results_path, f"{agent_type.upper()}_Statistics.png"))
    plt.show()

    # Saving Statistics
    statistics_dict = {
        "x": agent_x,
        "max_r": agent_max_r, "min_r": agent_min_r, "mean_r": agent_mean_r,
        "max_s": agent_max_s, "min_s": agent_min_s, "mean_s": agent_mean_s,
        "max_t": agent_max_t, "min_t": agent_min_t, "mean_t": agent_mean_t,
        "max_sec": agent_max_sec, "min_sec": agent_min_sec, "mean_sec": agent_mean_sec,
        "max_rt": agent_max_rt, "min_rt": agent_min_rt, "mean_rt": agent_mean_rt
    }
    utils.save_data(data=statistics_dict,
                    path=os.path.join(results_path, f"{agent_type.upper()}_Statistics_Dict"))
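# Hedged entry-point sketch: main() only reads `.agent` and `.results_path`
# from its argument, so an argparse namespace fits (flag names assumed):
if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("--agent", choices=["reinforce", "vpg", "sac"],
                        default="reinforce")
    parser.add_argument("--results_path", default="results")
    main(parser.parse_args())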
def login_get_cookies():
    cookies = utils.load_cookies()
    if cookies:
        session = requests.Session()
        session.headers['User-Agent'] = User_Agent
        session.cookies = requests.utils.cookiejar_from_dict(cookies)
        return session, cookies
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    driver = webdriver.Chrome(executable_path='./chromedriver_win32/chromedriver.exe')
    url = 'https://accounts.douban.com/passport/login'
    while True:
        driver.get(url)
        link = driver.find_element_by_xpath(
            "//ul[@class='tab-start']/li[@class='account-tab-account']")
        link.click()
        name_input = driver.find_element_by_xpath(
            "//div[@class='account-form-field']/input[@id='username']")
        pass_input = driver.find_element_by_xpath(
            "//div[@class='account-form-field']/input[@id='password']")
        remember_input = driver.find_element_by_xpath(
            "//div[@class='account-form-ft']/p[@class='account-form-remember']/input[@id='account-form-remember']")
        login_button = driver.find_element_by_xpath(
            "//div[@class='account-form-field-submit ']/a[@class='btn btn-account']")
        name_input.clear()
        name_input.send_keys(username)
        pass_input.clear()
        pass_input.send_keys(password)
        remember_input.click()
        login_button.click()
        start_ts = time.time()
        print("start..", start_ts)
        try:
            # wait until the login buttons disappear, i.e. login has finished
            WebDriverWait(driver, 15).until_not(
                lambda x: x.find_element_by_xpath(
                    "//div[@class='account-form-field-submit ']/a[@class='btn btn-account btn-active']"
                ).is_displayed())
            WebDriverWait(driver, 15).until_not(
                lambda x: x.find_element_by_xpath(
                    "//div[@class='account-form-field-submit ']/a[@class='btn btn-account']"
                ).is_displayed())
        except Exception:
            import traceback
            print(traceback.format_exc())
            utils.save_html('exc_inex.html', driver.page_source)
            import os
            os._exit(-1)
        print('end..', time.time() - start_ts)
        driver.save_screenshot('submit.png')
        utils.save_html('index.html', driver.page_source)
        if u'douban' in driver.page_source:
            selenium_cookies = driver.get_cookies()
            print("selenium_cookies:", selenium_cookies)
            driver.close()
            break
        else:
            driver.close()
    # handle cookies
    session = requests.Session()
    session.headers['User-Agent'] = User_Agent
    for i in selenium_cookies:
        requests.utils.add_dict_to_cookiejar(session.cookies, {i['name']: i['value']})
    cookies = requests.utils.dict_from_cookiejar(session.cookies)
    utils.save_cookies(cookies)
    return session, cookies
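# Typical use of login_get_cookies: obtain an authenticated session once and
# reuse it for later requests (the URL is illustrative):
def fetch_logged_in_page(url='https://www.douban.com/mine/'):
    session, cookies = login_get_cookies()
    r = session.get(url)
    utils.save_html('mine.html', r.text)
    return r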