def example1():
    '''Two basic ways to find tags:
    1. find
    2. findAll (findAll is the py2 name; under py3's new naming convention it was
       renamed to find_all, but the original findAll still works.
       The two methods behave exactly the same.)
    '''
    url = 'http://www.cvs.com/stores/cvs-pharmacy-locations'
    spider = Crawler()
    html = spider.html(url)
    if html:
        soup = BS4(html)
        # print(soup.find('li'))      # just find the first one
        # print(soup.find_all('li'))  # find a list of tags matching your search
        print(soup.find_all('li', limit=3))  # find the first 3 tags matching your search
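# A minimal, self-contained sketch of the find vs. find_all distinction described above,
# run against an inline HTML string instead of a live URL. The function name and the
# sample markup are illustrative assumptions, not part of the original code.
from bs4 import BeautifulSoup as BS4

def example1_offline():
    html = "<ul><li>a</li><li>b</li><li>c</li><li>d</li></ul>"
    soup = BS4(html, "html.parser")
    print(soup.find('li'))               # first match only: <li>a</li>
    print(soup.find_all('li'))           # every match, returned as a list
    print(soup.find_all('li', limit=3))  # at most the first 3 matches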
def parse_cost_from_html(html):
    """Get the average bill cost value from html."""
    try:
        soup = BS4(html)
        article = soup.find("article", class_="sub_details")
        h3 = article.find("h3")
        strong = h3.find("strong")
        text = strong.text.strip()
        text = text.replace("$", "")
        text = text[:-1]  # drop the trailing character after the amount
        return float(text)
    except Exception:  # any parsing failure (missing tag, non-numeric text) yields None
        return None
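# A hypothetical usage sketch for parse_cost_from_html. The markup below is an
# assumption about what the scraped page looks like (an <article class="sub_details">
# whose <h3><strong> holds something like "$123.45+"), purely for illustration.
sample_html = (
    '<article class="sub_details">'
    '<h3>Average bill: <strong>$123.45+</strong></h3>'
    '</article>'
)
# parse_cost_from_html(sample_html) would return 123.45 under that assumption.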
def parser_step_1() -> list:
    '''Initial parsing step: get the list of animals from the first page.'''
    req = requests.get(
        'https://ru.wikipedia.org/wiki/%D0%9A%D0%B0%D1%82%D0%B5%D0%B3%D0%BE%D1%80%D0%B8%D1%8F:'
        '%D0%96%D0%B8%D0%B2%D0%BE%D1%82%D0%BD%D1%8B%D0%B5_%D0%BF%D0%BE_'
        '%D0%B0%D0%BB%D1%84%D0%B0%D0%B2%D0%B8%D1%82%D1%83')
    html = req.text
    parser = BS4(html, "html.parser")
    elements = parser.select("ul")
    list_animals = elements[2].text.split('\n')
    while True:
        list_animals += parser_step_2(list_animals)
        if 'Ящурки' == list_animals[-1]:
            break
    return list_animals
def parser_step_2(list_animals) -> list:
    '''Secondary parsing step: fetch the remaining pages of animals.'''
    element = list_animals[-1].split()
    result = '+'.join(element)
    req = requests.get(
        f"https://ru.wikipedia.org/w/index.php?title=Категория:Животные_по_алфавиту&pagefrom={result}"
    )
    html = req.text
    parser = BS4(html, "html.parser")
    elements = parser.select("ul")
    tmp_list_animals = elements[2].text.split('\n')
    if 'ЖивотныеОрганизмы по алфавиту' != elements[3].text:
        return tmp_list_animals + elements[3].text.split('\n')
    return tmp_list_animals
def property_detail_by_street_and_zipcode(driver, street, zipcode):
    """Enter text in an input box and fetch the resulting property page."""
    driver.get("http://www.zillow.com/")                # open the page again
    elem = driver.find_element_by_name("citystatezip")  # locate the search box named "citystatezip"
    elem.send_keys("%s %s" % (street, zipcode))         # type the query into the text box
    elem.send_keys(Keys.RETURN)                         # submit it (equivalent to clicking "search")
    html = driver.page_source
    soup = BS4(html)
    a = soup.find("a", class_="routable mask hdp-link")
    url = "http://www.zillow.com" + a["href"]
    driver.get(url)
    html = driver.page_source
    return html
def parse(response):
    base_domain = response.url
    r = response.content
    soup = BS4(r, 'lxml')
    li_list = soup.select('#p_left ul')[0].find_all('li')
    for li in li_list:
        li_tag = li.a.text   # extracted but not used further in this snippet
        content = li.p.text  # extracted but not used further in this snippet
    a_list = soup.select('#p_left ul')[1].find_all('a')
    page_format = a_list[-1]['href'][:-5].split('_')
    page_last, page_sec = page_format[-1], page_format[1]
    for i in range(1, int(page_last) + 1):
        link = base_domain + 'list_' + page_sec + '_' + str(i) + '.html'
        link_set.add(link)  # link_set is assumed to be defined at module level
    for link in link_set:
        parse_set(link)
def lostfilm_tv_scanner(url):  # done 20190211
    html = gethtml(url.get(), True)
    soup = BS4(html, "html5lib")
    webname = soup.find("div", class_='title-ru').text or "parsing_error"
    webname += " / " + (soup.find("div", class_='title-en').text or "parsing_error")
    website = "lostfilm.tv"
    textready = soup.find("div", class_="details").text or "parsing_error"
    text = textready.split("серий: ", 1)[1].split(" ")[0]
    webready = safeint(text) or -1
    canscan = "yes"
    scanstatus = "good" if "parsing_error" != webname and webready > -1 else "bad"
    return webname, website, canscan, scanstatus, webready
def crawel_wds_url():
    url = "https://wds.modian.com/ajax/comment_list"
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.8,zh-CN;q=0.6,zh;q=0.4',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36'
    }
    payload = {'page': '1', 'post_id': '17190', 'pro_id': '8098'}
    r1 = requests.post(url, data=payload, headers=headers)
    raw_text = r1.text
    html_doc_1 = str(json.loads(raw_text)['data'])
    soup = BS4(html_doc_1, "html.parser")
    return soup
def green_tea_scanner(url):  # done 20190211
    html = gethtml2(url)
    # html = gethtml(url, True)
    soup = BS4(html, "html5lib")
    webname = soup.find("meta", itemprop="name")["content"] or "parsing_error"
    website = "green-teatv.com"
    webready = -1
    for div in soup.find_all("div", class_="info-label"):
        if div.text == "Длительность:":
            webready = safeint(div.findNext("div", class_="info-desc").text.split("из")[0]) or -1
            break
    canscan = "yes"
    scanstatus = "good" if "parsing_error" != webname and webready > -1 else "bad"
    return webname, website, canscan, scanstatus, webready
def parse_overview_data(self):
    data = []
    soup = BS4(self.page_source, "html.parser")
    if soup.find('h1', {'class': ' strong tightAll'}):
        name = soup.find('h1', {'class': ' strong tightAll'}).text.strip()
        overview = soup.find('div', {'class': 'info flexbox row col-hh'})
        infos = overview.find_all('div', {'class': 'infoEntity'})
        website = ''
        hq = ''
        size = ''
        founded = ''
        type = ''  # initialized like the other fields so the dict below never sees an unbound name
        industry = ''
        revenue = ''
        for info in infos:
            if 'Website' in info.text:
                website = info.find('a', {'class': 'link'}).text.strip()
            elif 'Headquarters' in info.text:
                hq = info.find('span', {'class': 'value'}).text.strip()
            elif 'Size' in info.text:
                size = info.find('span', {'class': 'value'}).text.strip()
            elif 'Founded' in info.text:
                founded = info.find('span', {'class': 'value'}).text.strip()
            elif 'Type' in info.text:
                type = info.find('span', {'class': 'value'}).text.strip()
            elif 'Industry' in info.text:
                industry = info.find('span', {'class': 'value'}).text.strip()
            elif 'Revenue' in info.text:
                revenue = info.find('span', {'class': 'value'}).text.strip()
        # print("*** print info over")
        review = {
            'Name': name,
            'Website': website,
            'Headquarters': hq,
            'Size': size,
            'Founded': founded,
            'Type': type,
            'Industry': industry,
            'Revenue': revenue
        }
        data.append(review)
    return data
def __init__(self, registry: dict):
    """Virtually private constructor."""
    if ServiceNowSession.instance is not None:
        raise Exception("This is a singleton object")
    else:
        session = requests.session()
        request = session.get(ServiceNowSession.__snow_url)
        auth_payload = registry
        request_1 = session.post(ServiceNowSession.__login_url, data=auth_payload)
        soup = BS4(request_1.text, 'html.parser')
        if 'Establishing session' in soup.find_all("h1", class_="loading-message")[0].text:
            ServiceNowSession.instance = session
        else:
            ServiceNowSession.instance = None
def zillow_property_detail(address, zipcode):
    url = gen_url(address, zipcode)  # generate the query's http url
    spider = Crawler()
    html = spider.html(url)  # fetch html
    if html:  # if we got good html, analyze it
        try:
            soup = BS4(html)
            dt = soup.find("dt", class_="property-data")
            info = dt.text.strip()
            span = soup.find("span", itemprop="addressLocality")
            city = span.text.strip()
            span = soup.find("span", itemprop="addressRegion")
            state = span.text.strip()
            return address, city, state, zipcode, info
        except Exception:  # if anything goes wrong during analysis, raise ExtractorError
            raise ExtractorError(address, zipcode, url)
    else:  # if bad html, raise HttpError
        raise HttpError(address, zipcode, url)
def anilibria_tv_scanner(url):  # done 20190211
    html = gethtml2(url)
    # html = gethtml(url, True)
    soup = BS4(html, "html5lib")
    webname = soup.find("h1", class_='title-rus').text or "parsing_error"
    webname += " / " + (soup.find("h3", class_='title-original').text or "parsing_error")
    website = "anilibria.tv"
    textready = soup.find("div", class_='torrent-first-col').span.text or "parsing_error"
    text = textready.split(" ", 1)[1]
    text = text.split(" ")[0].split("[")[0]
    if "-" in text:
        text = text.split("-")[1]
    webready = safeint(text) or -1
    canscan = "yes"
    scanstatus = "good" if "parsing_error" != webname and webready > -1 else "bad"
    return webname, website, canscan, scanstatus, webready
def shikimori_org_scanner(url):  # done 20190211
    html = gethtml(url.get())
    soup = BS4(html, "html5lib")
    webname = soup.find("header", class_="head").meta["content"] or "parsing_error"
    website = "shikimori.org"
    maxready = 0
    span_ep_num = soup.find_all('span', class_="episode-num")
    for span_ep in span_ep_num:
        nextSpan = span_ep.findNext('span', class_="episode-kinds").string
        epNum = safeint(span_ep.string.split("#")[1]) or -1
        if "озвучка" in nextSpan and epNum > maxready:
            maxready = epNum
    webready = maxready
    canscan = "yes"
    scanstatus = "good" if "parsing_error" != webname and webready > -1 else "bad"
    return webname, website, canscan, scanstatus, webready
def clean_up(message_bodies, nlon, nlon_model):
    cleansed = list()
    words_number = 0
    words_limit = 10000
    for message_body in message_bodies:
        try:
            soup = BS4(message_body, 'html.parser')
            clean_message_body = soup.text
        except Exception as e:
            logger.error('Error with BS4 on text:\n\n%s\n\n' % message_body, str(e))
            clean_message_body = message_body.strip()
        clean_message_body = re.sub(r'^\s*>+( .*)?', '', clean_message_body, flags=re.MULTILINE)
        clean_message_body = re.sub(r'^\s*\+', '', clean_message_body, flags=re.MULTILINE)
        clean_message_body = re.sub(r'^\s*---\+', '', clean_message_body, flags=re.MULTILINE)
        clean_message_body = re.sub(r'\n[\t\s]*\n+', '', clean_message_body, flags=re.MULTILINE)
        clean_message_body = re.sub(r'({+|}+|\++|_+|=+|-+|\*|\\+|/+|@+|\[+|\]+|:+|<+|>+|\(+|\)+)', '', clean_message_body, flags=re.MULTILINE)
        clean_message_body = re.sub(r'On\s(.[^\sw]*\s)*wrote', '', clean_message_body, flags=re.MULTILINE)
        clean_message_body = re.sub(r'[\n+]Sent from', '', clean_message_body, flags=re.MULTILINE)
        clean_message_body = re.sub(r'https?:\/\/\S*', '', clean_message_body, flags=re.MULTILINE)
        clean_message_body = re.sub(r'[\w\.-]+ @ [\w\.-]+', '', clean_message_body, flags=re.MULTILINE)
        # clean_message_body = clean_message_body.encode('utf-8').strip()
        message_by_lines = clean_message_body.splitlines()
        list_length = len(message_by_lines)
        index = 0
        for count in range(0, list_length):
            text = robjects.StrVector([message_by_lines[index]])
            if nlon.NLoNPredict(nlon_model, text)[0] == 'Not':
                del message_by_lines[index]
            else:
                index = index + 1
        clean_message_body = '\n'.join(message_by_lines)
        split_message = clean_message_body.split()
        words_number += len(split_message)
        if words_number > words_limit:
            split_message = split_message[:(words_limit - words_number)]
            clean_message_body = ' '.join(split_message)
            cleansed.append(clean_message_body.strip())
            break
        cleansed.append(clean_message_body.strip())
    return cleansed
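# A tiny, self-contained illustration of the first substitution in clean_up (stripping
# quoted-reply lines). The sample message below is a made-up assumption for demonstration.
import re

sample = "Thanks!\n> quoted reply line\n>> deeper quote\nregards"
stripped = re.sub(r'^\s*>+( .*)?', '', sample, flags=re.MULTILINE)
print(stripped)  # the quoted lines are blanked out; a later substitution in clean_up collapses the leftover blank lines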
def anistar_me_scanner(url):  # done 20190211
    r = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0 (X11; Linux i686; rv:64.0) Gecko/20100101 Firefox/64.0'})
    # r = urllib.request.Request(url, data='cmd=date +%Y%m%d', headers={'User-Agent': 'Mozilla/5.0 (X11; Linux i686; rv:64.0) Gecko/20100101 Firefox/64.0'})
    html = gethtml2(r)
    if not html:  # retry with a different User-Agent if the first request failed
        r = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:64.0) Gecko/20100101 Firefox/64.0'})
        # r = urllib.request.Request(url, data='cmd=date +%Y%m%d', headers={'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:64.0) Gecko/20100101 Firefox/64.0'})
        html = gethtml2(r)
    soup = BS4(html, "html5lib")
    webname = soup.find("h1", itemprop="name").string or "parsing_error"
    website = "anistar.me"
    webready = soup.find("p", class_="reason").string
    webready = safeint(re.findall(r'\d+', webready)[0]) or -1
    canscan = "yes"
    scanstatus = "good" if "parsing_error" != webname and webready > -1 else "bad"
    return webname, website, canscan, scanstatus, webready
def mytask(bot):  # real-time update of the fundraising (jz) info
    global base
    url1 = "https://wds.modian.com/show_weidashang_pro/5329#1"
    r1 = requests.get(url1, verify=False, headers=headers)
    html_doc_1 = r1.text
    soup1 = BS4(html_doc_1, "html.parser")
    gl = bot.List('group', '556592071')
    if gl is not None:
        for group in gl:
            result = return_top()
            print("首位" + result)
            if result != base:
                print("before:" + base)
                base = result
                print("after:" + base)
                bot.SendTo(group, return_ans(result))
            else:
                print("equal_before:" + base)
                print("equal_after:" + base)
def scrape_menu(meal_id, date):
    meal_id = str(meal_id)
    menu_dict = {}
    url = ('https://tamuk.campusdish.com/Commerce/Catalog/Menus.aspx?LocationId=6532&PeriodId='
           + meal_id + '&MenuDate=' + date)
    res = requests.get(url)
    res.raise_for_status()
    soup = BS4(res.text, 'html.parser')
    for group in soup.select('.menu-details-station'):
        category = group.find('h2').text
        food_list = []  # collect only this station's items
        food_items = group.select('.menu-details-station-item .menu-name a')
        for item in food_items:
            food_list.append(item.text)
        menu_dict[category] = food_list[:]
    return menu_dict
def level3(url, spider):
    base_url = "http://high-schools.com"
    html = spider.html(base_url + url)
    soup = BS4(html)
    table = soup.find(
        "table",
        class_="table table-striped table-hover table-condensed table-sortable"
    )
    tbody = table.find("tbody")
    for tr in tbody.find_all("tr"):
        url = tr.td.a["href"]
        info = {
            key: td.text.strip()
            for key, td in zip([
                "school_name", "type", "students", "student_to_teacher_ratio",
                "free_or_reduced_lunch", "school_distinct"
            ], tr.find_all("td"))
        }
        yield url, info
def return_ans(result):
    url1 = "https://wds.modian.com/show_weidashang_pro/5329#1"
    r1 = requests.get(url1, verify=False, headers=headers)
    html_doc_1 = r1.text
    soup1 = BS4(html_doc_1, "html.parser")
    nick_sup = []
    money_sup = []
    rg = soup1.find("div", class_="b").get_text()
    num = int(rg[0:3])
    print(num)
    people = num
    if num >= 20:
        num = 20
    print(people)
    fond = 14968.42
    res = soup1.find_all("div", class_="mon current")[0].find_all("span")[1].get_text()
    res = res[1:]
    res = float(res.replace(',', ""))
    sub = round(fond - res, 2)
    if sub <= 0:
        sub = 0
    for i in range(num):
        # soup2 is assumed to be defined elsewhere (e.g. the soup returned by crawel_wds_url)
        nick = soup2.find_all("span", "nickname")[i].get_text()
        nick_sup.append(nick)
    money = soup1.find_all("span", "nick_sup")[0].get_text()
    for i in range(num):
        # note: both branches return on the first iteration, so only nick_sup[0] is compared
        if result == nick_sup[i]:
            print(i)
            return ("刚刚" + result + "聚聚 " + money + "!"
                    + "在聚聚榜上排名第" + str(i + 1) + "位!" + "\n"
                    + "现在共有" + str(people) + "个人参与了活动。" + "\n"
                    + "距离今日集资目标【¥" + "一张大盘" + "】还差【¥" + str(sub) + "】\n"
                    + "在最后的这段日子里,我们再坚持一下!" + "\n"
                    + "wds链接:http://jli.li/I")
        else:
            return ("刚刚" + result + "聚聚 " + money + "!" + "\n"
                    + "现在共有" + str(people) + "个人参与了活动。" + "\n"
                    + "距离今日集资目标【" + "一张大盘" + "】还差【¥" + str(sub) + "】\n"
                    + "在最后的这段日子里,我们再坚持一下!" + "\n"
                    + "wds链接:http://jli.li/I")
def make_soup_request(self, url, *args, soup_parser=default_soup_parser, **kwargs):
    """Make a request to a given url and convert the response text to a
    `bs4.BeautifulSoup` instance.

    WARNING: request status codes are not automatically checked, so it is
    highly suggested that you explicitly pass the kwarg check_status_code
    with a truthy value to ensure they are checked.

    See also: help(self.make_request)

    Parameters
    ----------
    soup_parser
        explicit parser to be used by the `BeautifulSoup` instance
    """
    return BS4(self.make_request(url, *args, **kwargs).text, soup_parser)
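# A hypothetical usage sketch for make_soup_request; `client` stands in for whatever
# object exposes make_request (an assumption, not part of the original code).
# soup = client.make_soup_request(
#     "https://example.com/page",
#     check_status_code=True,     # per the WARNING above, opt in to status checking
#     soup_parser="html.parser",  # override the default parser if desired
# )
# print(soup.title)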
def property_info(address, zipcode):
    url = gen_url(address, zipcode)
    spider = Crawler()
    html = spider.html(url)
    if html:
        try:
            soup = BS4(html)
            dt = soup.find("dt", class_="property-data")
            info = dt.text.strip()
            span = soup.find("span", itemprop="addressLocality")
            city = span.text.strip()
            span = soup.find("span", itemprop="addressRegion")
            state = span.text.strip()
            return address, city, state, zipcode, info
        except Exception:
            log.write(
                "Failed to analyze address = %s, zipcode = %s" % (address, zipcode),
                "Failed Extraction")
            return None
    else:
        log.write("%s Failed to get http request" % url, "Http Error")
async def query_take_first_result(query):
    print('Requesting page to anibin...')
    async with aiohttp.ClientSession() as sesi:
        async with sesi.get('http://anibin.blogspot.com/search?q={}'.format(query)) as resp:
            response = await resp.text()

    # Let's fiddle with the data
    soup_data = BS4(response, 'html.parser')
    first_query = soup_data.find('div', attrs={'class': 'date-posts'})
    if not first_query:
        return None, None, None

    # Query results
    query_title = first_query.find('h3', attrs={'class': 'post-title entry-title'}).text.strip()
    if not query_title:
        return None, None, None

    content_data = str(first_query.find('div', attrs={'class': 'post-body entry-content'}))
    n_from = content_data.find('評価:')
    if n_from == -1:
        return False, False, False
    nat_res = content_data[n_from + 3:]
    nat_res = nat_res[:nat_res.find('<br/>')]

    n_from2 = content_data.find('制作:')
    if n_from2 == -1:
        return [query_title, nat_res, 'Unknown']
    studio = content_data[n_from2 + 3:]
    studio = studio[:studio.find('<br/>')]
    return [query_title, nat_res, studio]
def shikimori_one_scanner(url):  # done 20190625
    # time.sleep(3)  # looks like a 2-3 second pause is needed, or the server does not respond
    # dt = datetime.datetime.today().strftime('%Y-%m-%d')
    timeout = 5
    r = urllib.request.Request(
        url,
        headers={
            'User-Agent': 'Mozilla/5.0 (X11; Linux i686; rv:64.0) Gecko/20100101 Firefox/64.0'
        })
    # r = urllib.request.Request(url, data='cmd=date +%Y%m%d', headers={'User-Agent': 'Mozilla/5.0 (X11; Linux i686; rv:64.0) Gecko/20100101 Firefox/64.0'})
    html = gethtml2(r)
    if not html:  # retry with a different User-Agent if the first request failed
        r = urllib.request.Request(
            url,
            headers={
                'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:64.0) Gecko/20100101 Firefox/64.0'
            })
        # r = urllib.request.Request(url, data='cmd=date +%Y%m%d', headers={'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:64.0) Gecko/20100101 Firefox/64.0'})
        html = gethtml2(r)
    soup = BS4(html, "html5lib")
    htmltext = str(soup)
    webready = -1  # error etc.
    print("Эпизоды:" in htmltext)
    episodes = htmltext.split("Эпизоды:", 1)[1]
    episodes = episodes.split("class=\"value\">", 1)[1]
    episodes = episodes.split("<", 1)[0].split("/")[0]
    print("episodes === ", episodes, flush=True)
    if episodes:
        webready = safenumber(episodes) or webready
    fullSeason = soup.find("span", {"data-text": "вышло"}) or False
    if fullSeason:
        webready = 0  # keep it for "full season released" etc., shikimori.one only
    print(webready)
    return 0, 0, 0, 0, webready
def details(self, response):
    details_html = Selector(response=response).extract()
    detail = BS4(details_html, 'lxml')
    details_data = self.make_details_data(detail)
    price = details_data.get('price')
    glod = details_data.get('glod')
    if price != 0:
        unit_price = glod / price
    else:
        unit_price = 0
    print(glod, price, response.url)
    item_obj = WowGlodPriceItem(
        price=price,
        glod=glod,
        unit_price=unit_price,
        area=details_data.get('area'),
        server=details_data.get('server'),
        camp=details_data.get('camp'),
        push_timestrap=details_data.get('push_timestrap'),
        order_id=details_data.get('order_id'),
        url=response.url)
    yield item_obj
def download_proxy(self, maximum_num_of_proxy=10):
    """Load the latest available proxies from www.us-proxy.org.

    There are 3 levels of proxies according to their anonymity.

    Level 1 - Elite Proxy / Highly Anonymous Proxy: the web server can't detect
        whether you are using a proxy.
    Level 2 - Anonymous Proxy: the web server can know you are using a proxy,
        but it can't know your real IP.
    Level 3 - Transparent Proxy: the web server can know you are using a proxy
        and it can also know your real IP.

    You can change the rules below the line marked
    "=== EDIT THE FOLLOWING RULES CAN FILTER THE PROXY YOU WANT"
    to pick the proxies you need. By default only Elite proxies are used.
    """
    ### get www.us-proxy.org homepage html
    spider = Crawler()
    html = spider.html("http://www.us-proxy.org/")

    ### analyze the html, save useful proxies
    ips = list()
    res = list()
    soup = BS4(html)
    table = soup.find("table", id="proxylisttable")
    for tr in table.tbody.find_all("tr"):
        ip, port, code, country, anonymity, google, https, last_check = [
            td.text for td in tr.find_all("td")
        ]
        ### === EDIT THE FOLLOWING RULES CAN FILTER THE PROXY YOU WANT
        if anonymity == "elite proxy":  # default: only use elite proxies
            ips.append("http://%s:%s" % (ip, port))
            res.append([0.0, 0.0, 1.0])
        if len(res) >= maximum_num_of_proxy:  # if we got enough usable proxies, stop
            break
    self.proxy = pd.DataFrame(res, index=ips, columns=["success", "tried", "health"])
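# A hedged sketch of how the filter rule above could be loosened to also accept
# Level 2 (anonymous) proxies, per the anonymity levels in the docstring. The
# "anonymous" column text is an illustrative assumption about the site's table,
# not something confirmed by the original code.
#     if anonymity in ("elite proxy", "anonymous"):
#         ips.append("http://%s:%s" % (ip, port))
#         res.append([0.0, 0.0, 1.0])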
def get_bulkcurrencies(self):
    """Get the supported currencies.

    Scraped from a JSON object embedded in a javascript tag on the html page.
    """
    start = r'YAHOO\.Finance\.CurrencyConverter\.addCurrencies\('
    _json = r'\[[^\]]*\]'
    try:
        resp = get(self.currencies_url)
        resp.raise_for_status()
    except exceptions.RequestException as e:
        self.warn("%s: Problem whilst contacting endpoint:\n%s" % (self._name, e))
    else:
        # Find the javascript that contains the json object
        soup = BS4(resp.text, 'html.parser')
        re_start = re.compile(start)
        jscript = soup.find('script', type='text/javascript', text=re_start).string

        # Separate the json object and cache it
        re_json = re.compile(_json)
        match = re_json.search(jscript)
        if match:
            json_str = match.group(0)
            with open(self._cached_currency_file, 'w') as fd:
                fd.write(json_str)  # the file is opened in text mode, so write the str directly

        # Parse the json file
        with open(self._cached_currency_file, 'r') as fd:
            j = json.load(fd)
        if not j:
            raise RuntimeError(
                "JSON not found at endpoint or as cached file:\n%s" % self._cached_currency_file)
        return j
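# A self-contained sketch of the extraction idea used in get_bulkcurrencies: find the
# <script> whose text matches the addCurrencies( prefix, then cut the JSON array out
# with the same bracket regex. The inline HTML below is a made-up assumption purely
# for illustration.
import json
import re
from bs4 import BeautifulSoup as BS4

demo_html = """
<script type="text/javascript">
YAHOO.Finance.CurrencyConverter.addCurrencies([{"shortname": "USD"}, {"shortname": "EUR"}]);
</script>
"""
demo_soup = BS4(demo_html, 'html.parser')
demo_script = demo_soup.find(
    'script', type='text/javascript',
    text=re.compile(r'YAHOO\.Finance\.CurrencyConverter\.addCurrencies\(')).string
demo_match = re.search(r'\[[^\]]*\]', demo_script)
if demo_match:
    print(json.loads(demo_match.group(0)))  # -> [{'shortname': 'USD'}, {'shortname': 'EUR'}]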
def find_dns_records(self, domain, record_type='A', dataframe=True):
    """Find all dns records of a given domain.

    :param domain: a typical domain name, e.g. "example.com"
    :returns: a pandas DataFrame of records, or an iterable of DNSRecord tuples
    """
    html = self.session.get(self.zonefile_url.format(domain=domain)).text
    if record_type == 'A':
        # Update the security token while we're at it.
        sec_pattern = r'nonce=\"([0-9A-Za-z]+)\"'
        self.sec = re.compile(sec_pattern).findall(html)[0]
        pattern = (
            "Undo{rt}Edit\\('tbl{rt}Records_([0-9]+)', '([^']+)', '([^']+)', "
            "'([^']+)', '([^']+)', '([^']+)', '([^']+)'\\)"
        ).format(rt=record_type)
        try:
            results = map(DNSRecord._make, re.compile(pattern).findall(html))
        except Exception:
            logger.exception('find domains broken, maybe godaddy has changed its web structure')
            return []
        return results
    else:
        # available records: ['A', 'CNAME', 'MX', 'TXT', 'SRV', 'AAAA', 'NS']
        try:
            assert ['CNAME'].count(record_type.upper())
        except Exception:
            logger.exception('package development incomplete. currently not handling "%s" records' % record_type)
            return []
        h = BS4(BS4(html).encode('ascii'))
        r_table = h.find_all('table', id=re.compile('tbl%sRecords' % record_type.upper()))[0]
        tbl_headers = r_table.tr.text
        tbl_headers = re.sub(r'[^\u0000-\u007F\s]+', '', tbl_headers)
        tbl_headers = re.sub(r'[\n]+', '\n', tbl_headers).strip().split('\n')
        tbl_headers = [it.strip() for it in tbl_headers if it.strip()]
        df = pd_dataframe(columns=tbl_headers)
        tbl_items = r_table.find_all('tr', attrs={'lstindex': True})
        for row in tbl_items:
            cols = row.find_all('input', attrs={"type": "hidden"})[1:]  # skipping checkmark col
            new_vals = [v.attrs['value'] for v in cols]
            df = df.append(dict(zip(tbl_headers, new_vals)), ignore_index=True)
        df.index += 1  # adjusting index to coincide with godaddy index
        if dataframe:
            return df
        else:
            DNSRecord = namedtuple(
                'DNSRecord',
                'index, hostname, value, ttl, host_td, points_to, rec_modified')
            pattern = (
                "\\('tbl{rt}Records_([0-9]+)', '([^']+)', '([^']+)', "
                "'([^']+)', '([^']+)', '([^']+)', '([^']+)'\\)"
            ).format(rt=record_type)
            results = map(DNSRecord._make, re.compile(pattern).findall(html))
            return results  # hand back the parsed records, mirroring the A-record branch
# (fragment) download every VOD lecture in the selected week, then return to the PLATO home page.
# The two prints tell the user the week's lectures are being downloaded and that the
# online attendance view count for each video will increase by one.
print('해당 주차 강의를 다운로드 합니다. 완료될 때까지 기다려주세요.')
print('- 온라인 출석부에서 해당 동영상 강의 열람 횟수가 1회 증가합니다.')
vod_list = week.find_all('li', {'class': 'activity vod modtype_vod'})
for i in range(0, len(vod_list)):
    vodLink = re.search(r'https://.*\d*', str(vod_list[i])).group()
    vodLink = vodLink[:vodLink.find('"')]
    vodLink = vodLink.replace('view', 'viewer')
    if vodLink.find('https://plato.pusan.ac.kr/mod/vod/viewer.php?') == -1:
        continue
    driver.get(vodLink)
    try:
        da = Alert(driver)
        da.dismiss()
    except:
        print("", end='')
    html = driver.page_source
    soup = BS4(html, 'html.parser')
    source = str(soup.find_all('source'))
    source = source[source.find('https'):source.find('m3u8') + 4]
    fileDownload(courseName + week.attrs['aria-label'] + '_' + str(i + 1), source)
driver.get('https://plato.pusan.ac.kr/')
driver.find_element_by_xpath('//*[@id="page-header"]/div[1]/div[2]/ul/li[2]/a').click()
driver.quit()
exit()
    # (fragment) body of an infinite-scroll loop: keep scrolling until the page height stops growing
    browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    # Wait for the page to load
    time.sleep(0.5)
    # Calculate the new scroll height and compare it with the last scroll height
    new_height = browser.execute_script("return document.body.scrollHeight")
    if new_height == last_height:
        break
    last_height = new_height

# preparing the soup
search = browser.page_source
soup = BS4(search, "html.parser")

# search every span carrying the result-title class (the person's name)
people = soup.findAll(
    "span",
    {"class": "entity-result__title-line flex-shrink-1 entity-result__title-text--black"})
for name in people:
    name_list.append(name.text)
for name in people:
    title_list.append(company)

clean_list = []