def extract_packt_free_book(content, encoding='utf-8'):
    if hasattr(content, 'read'):  # file-type
        content = content.read()
    if isinstance(content, bytes):  # convert to str
        content = content.decode(encoding)

    # Extracting information with html parser
    page = BeautifulSoup(content, 'html.parser')

    dealoftheday = page.select_one(
        '#deal-of-the-day div div div:nth-of-type(2)')
    if not dealoftheday:
        return None

    book = util.AttributeDict()
    try:
        book['name'] = dealoftheday.select_one(
            'div:nth-of-type(2) h2').text.strip()
        book['summary'] = dealoftheday.select_one(
            'div:nth-of-type(3)').text.strip()
        book['expires'] = int(dealoftheday.select_one(
            'span.packt-js-countdown').attrs['data-countdown-to'])

        image_source = page.select_one(
            '#deal-of-the-day > div > div > '
            'div.dotd-main-book-image.float-left > a > img'
        ).attrs.get('data-original', None)
        if image_source and image_source.startswith('//'):
            image_source = 'https:{0}'.format(image_source)
        book['cover'] = image_source

        return book
    except:
        return None
def article_crawler(url):
    """
    Crawls article url, and extracts fields
    args:
        url <str>: article url
    return:
        article_dict <dict>: article dict with fields
    """
    resp = requests.get(url)
    soup = BeautifulSoup(resp.text, 'html5lib')
    article = {}
    article['title'] = soup.select_one('#h1').text.replace('\u3000', ' ').strip()
    article['dt'] = soup.select_one('div.gggs > time').text.strip()
    article['content'] = soup.select_one('#summary').text.strip()
    if soup.select_one('div.urcc > a.function_icon.clicked'):
        article['view_count'] = int(re.findall(
            '\d+', soup.select_one('div.urcc > a.function_icon.clicked').text)[0])
    else:
        article['view_count'] = 0
    writer.writerow(article)
    f.flush()  # Flush here so we don't lose data on exception
    return article
def parse_object_content(content):
    obj = {}
    soup = BeautifulSoup(content, "html.parser")
    obj["div"] = _strip(soup.select_one("div").text)
    obj["select"] = _split(soup.select_one("select > option").text)
    print obj
    return obj
def GetDetail(self, html):
    soup = BeautifulSoup(html, 'html5lib')
    dt = dict()
    timelimit = soup.select_one('#conditions > div:nth-of-type(1)').text
    memorylimit = soup.select_one('#conditions > div:nth-of-type(2)').text
    L = soup.find_all(class_='content-wrapper')
    Title = ['description', 'input', 'output', 'sampleinput', 'sampleoutput', 'hint']
    for i in range(len(L)):
        try:
            dt[Title[i]] = L[i]
        except Exception as e:
            print(e)
    dt['specialjudge'] = soup.select_one('#spjinfo')
    if dt['specialjudge'] is None:
        dt['specialjudge'] = 0
    else:
        dt['specialjudge'] = 1
    s1 = 'Time Limit: '
    s2 = 'Memory Limit: '
    dt['timelimit'] = timelimit[timelimit.find(s1) + len(s1):]
    dt['memorylimit'] = memorylimit[memorylimit.find(s2) + len(s2):]
    return dt
def get_course_classes(self, kcdm):
    """
    Fetch the selectable teaching classes for a course in the course-selection system.

    @structure {'可选班级': [{'起止周': str, '考核类型': str, '教学班附加信息': str, '课程容量': int, '选中人数': int,
                '教学班号': str, '禁选专业': str, '教师': [str], '校区': str, '优选范围': [str], '开课时间,开课地点': [str]}],
                '课程代码': str, '课程名称': str}

    :param kcdm: course code
    """
    params = {'kcdm': kcdm.upper()}
    method = 'get'
    # url = 'student/asp/select_topRight.asp'
    url = 'student/asp/select_topRight_f3.asp'
    response = self.api_request(method, url, params)

    page = response.text
    ss = SoupStrainer('body')
    bs = BeautifulSoup(page, self.html_parser, parse_only=ss)
    class_table = bs.select_one('#JXBTable')
    if class_table.get_text(strip=True) == '对不起!该课程没有可被选的教学班。':
        return APIResult(None, response)

    result = dict()
    _, result['课程代码'], result['课程名称'] = bs.select_one('#KcdmTable').stripped_strings
    result['课程代码'] = result['课程代码'].upper()

    trs = class_table.find_all('tr')
    course_classes = []
    for tr in trs:
        tds = tr.find_all('td')
        assert len(tds) == 5

        # parse the info hidden in the alt attribute
        class_info_table = BeautifulSoup(tds[1]['alt'], self.html_parser)
        info_trs = class_info_table.select('tr')
        # campus, start/end weeks, assessment type, excluded majors
        cls_info = dict(safe_zip(info_trs[0].stripped_strings,
                                 parse_tr_strs([info_trs[1]])[0]))
        # enrolled count, course capacity
        for s in info_trs[2].stripped_strings:
            kv = [v.strip() for v in s.split(':', 1)]
            cls_info[kv[0]] = int(kv[1]) if kv[1] else None
        # extra class info
        # e.g. 教学班附加信息:屯溪路校区 上课地点:体育部办公楼2楼
        cls_info.update([(v.strip() or None for v in s.split(':', 1))
                         for s in info_trs[5].stripped_strings])
        # class time and location
        p = re.compile(r'周[一二三四五六日]:\(\d+-\d+节\) \(\d+-\d+周\).+?\d+')
        cls_info[info_trs[3].get_text(strip=True)] = p.findall(info_trs[4].get_text(strip=True))

        cls_info['课程容量'] = int(cls_info['课程容量'])
        cls_info['选中人数'] = int(cls_info['选中人数'])
        cls_info['教学班号'] = tds[1].string.strip()
        cls_info['教师'] = [s.strip() for s in tds[2].text.split(',')]
        cls_info['优选范围'] = [s.strip() for s in tds[3].text.split(',')]
        course_classes.append(cls_info)

    result['可选班级'] = course_classes
    return APIResult(result, response)
def extract_details(self):
    for item in self.get_detail_workers():
        soup = BeautifulSoup(item.worker.result(), 'lxml')
        if soup.select_one('div.article__heading__title') is None:
            continue
        self.detail_candidates.append({
            'title': soup.select_one('div.article__heading__title').text,
            'content': soup.select_one('div.node-article-content').text
        })
    return self.detail_candidates
def __init__(self, url):
    html = urlopen(url)
    bs_html = BeautifulSoup(html.read(), "html.parser")
    self.title = bs_html.select_one("#title > div.left > h1").get_text(strip=True)
    self.publisher = bs_html.select_one("#title > div.left > h2").get_text(strip=True)
    self.category = bs_html.select_one(
        "#left-stack > div.lockup.product.course.itunes-u > ul > li.genre > a > span"
    ).get_text(strip=True)
    self.rating = bs_html.select_one(
        "#left-stack > div.extra-list.customer-ratings > div")["aria-label"]
    bs_video_trs = bs_html.find_all("tr", attrs={"class": "podcast-episode video"})
    if bs_video_trs is not None:
        self.video_urls = [bs_video_tr["video-preview-url"] for bs_video_tr in bs_video_trs]
def parseBook(url, book):
    '''Parse the title (book name) and price from a Dangdang product page'''
    response = requests.get(url).text
    soup = BeautifulSoup(response, "lxml")
    title = soup.select_one('.name_info > h1').get('title')
    price_str = soup.select_one('.price_qiang .price_d').getText()
    pattern = re.compile("\d+\.?\d*")
    price = pattern.search(price_str).group(0)
    book.append({'title': title, 'price': price})
def login(self):
    # Get sessionid
    r = self.session.get('https://cas.xjtu.edu.cn/login')
    br = BeautifulSoup(r.content, "html.parser")
    lt = br.select_one('input[name="lt"]')['value']
    exe = br.select_one('input[name="execution"]')['value']
    # Auth
    headers = {'Content-Type': 'application/x-www-form-urlencoded'}
    data = 'username='******'&password='******'&code=&lt='+lt+'&execution='+exe+'&_eventId=submit&submit=%E7%99%BB%E5%BD%95'
    self.session.post('https://cas.xjtu.edu.cn/login', headers=headers, data=data)
def parse_listing_page(self, response):
    """Extract basic info from listing page, returns an Item."""
    body = BeautifulSoup(response.body)
    meta = response.meta
    item = DubizzleItem()
    item['title'] = body.select_one(LISTING_SELECTORS['title']).get_text(strip=True)
    item['date'] = body.select_one(LISTING_SELECTORS['date']).get_text(strip=True)
    item['make'] = meta['make'].title()
    item['model'] = meta['model'].title()
    yield item
def extract_details(self):
    for item in self.get_detail_workers():
        passage_id = item.url[-14:-5]
        soup = BeautifulSoup(item.worker.result(), 'lxml')
        comments = self.submit_job(self.comment_url.format(passage_id[-3:], passage_id))\
            .worker.result().decode('utf-8')
        comments = json.loads(comments[comments.index('{'):comments.rindex('}') + 1])
        user_comments = []
        for comment in comments['list']:
            user_comments.append(comment['content'])
        self.detail_candidates.append({
            'title': soup.select_one('h1').text,
            'content': soup.select_one('div.textbox').text,
            'comments': user_comments
        })
    return self.detail_candidates
def get_character_data(self, lodestone_id, achievements=False):
    data = request.urlopen('{0}/character/{1}'.format(self.lodestoneUrl, lodestone_id))
    page = BeautifulSoup(data, LodestoneClient.HTML_PARSER)

    # noinspection PyDictCreation
    character = {}
    character['lodestone_id'] = lodestone_id
    character['name'] = page.select_one(self.config['selector.character.name']).text
    character['server'] = page.select_one(self.config['selector.character.server']).text.strip() \
        .replace('(', '').replace(')', '')
    character['free_company'] = \
        page.select(self.config['selector.character.free_company'])[0].attrs['href'].split('/')[3]
    character['lodestone_profile'] = page.select_one(self.config['selector.character.profile']).text.strip()

    tmp = page.select_one(self.config['selector.character.race']).text.strip().split(' / ')
    character['race'] = tmp[0]
    character['gender'] = 'Male' if tmp[2] == '♂' else 'Female'

    tmp = page.select(self.config['selector.character.main'])
    character['nameday'] = tmp[0].text
    character['guardian'] = tmp[1].text
    character['city_state'] = tmp[2].text

    try:
        character['grand_company'] = tmp[3].text.split('/')[0]
        character['grand_company_rank'] = tmp[3].text.split('/')[1]
    except IndexError:
        # User has no grand company
        character['grand_company'] = None
        character['grand_company_rank'] = None

    character['mounts'] = [node.attrs['title'] for node in
                           page.select(self.config['selector.character.mounts'])[0].findAll('a')
                           if 'title' in node.attrs]
    character['minions'] = [node.attrs['title'] for node in
                            page.select(self.config['selector.character.mounts'])[1].findAll('a')
                            if 'title' in node.attrs]

    character['classes'] = []
    tmp = []
    for d in page.select(self.config['selector.character.classes']):
        if not d.text:
            continue
        tmp.append(d.text)
    for i in range(0, len(tmp), 3):
        if '-' in tmp[i + 1]:
            continue
        exp = tmp[i + 2].split(' / ')
        character['classes'].append({
            'name': tmp[i],
            'level': int(tmp[i + 1]),
            'current_exp': int(exp[0]),
            'next_exp': int(exp[1])
        })

    if achievements:
        character['recent_achievements'] = self.get_character_achievements(lodestone_id)

    return character
def process_file(path):
    f = open(path, 'r')
    html_doc1 = f.read()
    soup = BeautifulSoup(html_doc1, 'xml')
    # only handles single epsilon schedule
    data = {"epsilon_schedule": element_to_array(soup.select_one("epsilon").select_one("e1")),
            "particles": element_to_array(soup.select_one("particles")),
            "times": element_to_array(soup.select_one("times")),
            "measurements": process_measurements(soup),
            "models": process_models(soup)}
    #print data
    #print "\n\n\n"
    print json.dumps(data)
def parse_full_item(self, html, parsed_data=None):
    """
    Parse page with full details.

    :param html: HTML source to be parsed.
    :return: A dictionary with the details extracted.
    """
    soup = BeautifulSoup(html, 'lxml')
    parsed_data = parsed_data or {}
    data = {
        'sold': False,
        'ad_last_modified': self.get_last_modified(soup),
        'registration_expiry': self.get_registration_expiry(soup),
        'registration_plate': self._get_detail(soup, 'Registration Plate'),
        'doors': self.get_doors(soup),
        'body_type': self._get_key_feature(soup, 'BODY TYPE'),
        'transmission_type': self._get_key_feature(soup, 'TRANSMISSION'),
    }
    if not parsed_data.get('ad_title', None):
        data['ad_title'] = soup.select_one('title').get_text()
    if not parsed_data.get('odometer', None):
        data['odometer'] = self.extract_number(self._get_detail(soup, 'Kilometres'))
    data.update(self.extract_model(soup))
    data['series'] = self.get_series(soup, parsed_data.get('year'))
    return self.reset_optional(data)
def Analyse(self, html):
    soup = BeautifulSoup(html, 'html5lib')
    L = list()
    for i in range(2, 30):
        td = soup.select_one('body > table.a > tbody > tr:nth-of-type({})'.format(i))
        if td is None:
            break
        dt = dict()
        dt['originOJ'] = 'PKU'
        titles = ['realrunid', 'nickname', 'originProb', 'status', 'runmemory',
                  'runtime', 'language', 'codelenth', 'realsubmittime']
        for con in td.contents:
            dt[titles[0]] = con.text
            if titles[0] == 'codelenth':
                dt[titles[0]] = dt[titles[0]][:-1]
            titles = titles[1:]
        L.append(dt)
    return L
def main():
    resp = requests.get('https://github.com/login/')
    if resp.status_code != 200:
        return
    cookies = resp.cookies.get_dict()  # get the cookies
    soup = BeautifulSoup(resp.text, 'lxml')
    utf8_value = soup.select_one('form input[name="utf8"]').attrs['value']  # value of the hidden form field
    authenticity_token_value = soup.select_one('form input[name="authenticity_token"]').attrs['value']  # get the token
    data = {
        'utf8': utf8_value,
        'authenticity_token': authenticity_token_value,
        'login': '******',
        'password': '******'
    }
    resp = requests.post('https://github.com/session/', data=data, cookies=cookies)  # submit via a POST request
    print(resp.text)
def get(self, response, page):
    channel = {
        'page': page,
        'page_patten': None,
        'movies': []
    }
    soup = BeautifulSoup(response, "html.parser")

    # get total page
    last_page = soup.select_one('div.ah-pagenavi > ul.pagination > li.last')
    print("*********************** Get pages ")
    if last_page is not None:
        page = last_page.text.strip()
        channel['page'] = int(page)

    for movie in soup.select('div.ah-row-film > div.ah-col-film > div.ah-pad-film > a'):
        title = movie.select_one('span.name-film').find(text=True, recursive=False).strip()
        type = movie.select_one('span.number-ep-film').text.strip()
        label = "[%s] %s" % (type, title)
        thumb = movie.select_one('img').get('src')
        channel['movies'].append({
            'id': movie.get('href'),
            'label': label.encode("utf-8"),
            'title': title.encode("utf-8"),
            'realtitle': title.encode("utf-8"),
            'thumb': thumb,
            'type': type.encode("utf-8"),
        })
    return channel
def fetch(link_url):
    tree = {}
    content_list = []
    resp = requests.get(link_url, headers=headers)
    data = resp.content
    print('content', data)
    soup = BeautifulSoup(data, "html5lib")
    elements = list(soup.select_one('#container').select_one('#content_left').children)
    f = open('search.txt', 'wb+')
    i = 0
    for ele in elements:
        try:
            print('ele bs4 type', type(ele))
            if isinstance(ele, Tag):
                i += 1
                title = ele.select_one('h3').text
                print('!!! title', title)
                contents = ele.select('div')
                c = ''
                for content in set(contents[1:]):
                    if content.text:
                        if content.text not in content_list:
                            c += content.text + '||'
                            content_list.append(content.text)
                f.write(str(i) + '、标题: ' + title + '\n')
                f.write('内容: ' + c + '\n')
                tree[i] = (title, c)
        except Exception as e:
            print('error:', e)
    f.close()
    print('tree: ', len(tree), tree)
def _getBinsForZip(zipCode):
    # We get a 403 if we don't use the User-Agent
    r = requests.get('https://satruck.org/DropOff/Index', params={'zip': zipCode})
    if r.status_code != 200:
        raise RuntimeError('%s' % r)
    print r
    soup = BeautifulSoup(r.text, 'html.parser')
    divs = soup.select_one('#drop-off-location-list')  # .select( '.has-website' )
    print divs

    def cleanSelectText(root, selector, default=''):
        selection = root.select(selector)
        if len(selection):
            return re.sub('\\s+', ' ', selection[0].getText()).strip()
        else:
            return default

    locations = []
    for d in divs:
        locations.append(Location(
            title=cleanSelectText(d, '.drop-off-location-title'),
            address=cleanSelectText(d, '.drop-off-location-address') + cleanSelectText(d, '.drop-off-location-city-state-zip'),
            phone=cleanSelectText(d, '.drop-off-location-phone'),
            website=cleanSelectText(d, '.drop-off-location-website'),
            hours=cleanSelectText(d, '.drop-off-location-hours')
        ))
    return locations
def get_JD(jd_url):
    print 'jd_url=' + jd_url
    jd_data = requests.get(jd_url, verify=False, timeout=1)  # SSL connection error, so verify=False is needed
    soup = BeautifulSoup(jd_data.text)
    jd_one = soup.select_one("div.lf-border-box")
    if jd_one is None or len(jd_one) < 1:
        return None
    else:
        jd_title = jd_one.select_one("h3.bg-title").get_text().replace(" <span> </span>", "")
        # print jd_title
        jd_detail_box = jd_one.select_one("div.detail-box")
        jd_detail_table_td = jd_detail_box.select("table.detail-table.box-border td")
        jd_publish_date = jd_detail_table_td[1].get_text()
        jd_hc = jd_detail_table_td[11].get_text().strip()
        jd_detail_content = jd_detail_box.select("p.detail-content")
        jd_desc = jd_detail_content[0].get_text().strip()
        jd_request = jd_detail_content[1].get_text().strip()
        # jd_content = jd_desc + jd_request
        jd_content_data = {
            'jd_url': jd_url,
            'jd_title': jd_title,
            'jd_publish_date': jd_publish_date,
            'jd_hc': jd_hc,
            'jd_desc': jd_desc,
            'jd_request': jd_request,
        }
        # print jd_content_data
        return jd_content_data
def _parse(self, page: BeautifulSoup, url):
    seasons = OrderedDict()
    eqg = OrderedSet()
    child = page.select_one("#WikiaArticle h2")
    season = child.text
    while child.next_sibling:
        child = child.next_sibling
        if child.name == "table":
            for a in child.find_all("a", string="Transcript"):
                if not a.has_attr("class") or "new" not in a["class"]:
                    episode_url, fragment = urldefrag(a["href"])
                    episode_url = urljoin(url, episode_url)
                    if "Equestria Girls" not in season:
                        if season not in seasons:
                            seasons[season] = OrderedSet()
                        seasons[season].append(episode_url)
                    else:
                        eqg.append(episode_url)
            continue
        if child.name == "h2":
            season = child.text
            continue
    seasons["Equestria Girls"] = eqg
    return seasons
def handleFURLText(text):
    bs = BeautifulSoup(text, "html.parser")
    divs = bs.findAll('div', {"class": "user-feed-wrapBox clearfix"})
    for div in divs:
        pdivs = div.select('div.user-feed-wrap')
        for pdiv in pdivs:
            titleDivs = pdiv.findAll('div', attrs={"class": "title clearfix"})
            for titleDiv in titleDivs:
                print titleDiv.select_one('div.title_info span').text
                print titleDiv.select_one('div.title_desc p').text
            videoNameTag = pdiv.select_one('div.mod-piclist_info p.mod-piclist_info_title a')
            print videoNameTag.text
            print videoNameTag['data-videoid']
            print videoNameTag['href']
            videoPlayTag = pdiv.select_one('div.mod-piclist_info p.mod-piclist_info_times span.playTimes a')
            if videoPlayTag is not None:
                videoPlayNum = videoPlayTag.text
                print videoPlayNum
            videoCommentTag = pdiv.select_one('div.mod-piclist_info p.mod-piclist_info_times span.commentTimes a')
            if videoCommentTag is not None:
                videoCommentNum = videoCommentTag.text
                print videoCommentNum
            emTag = pdiv.find('em', attrs={"class": "con fs14"})
            if emTag is not None:
                print emTag.text
    tipsDiv = bs.select_one('div.tips-loading')
    if tipsDiv is not None:
        fURL = tipsDiv['data-loading-src']
        handleFURLPageText(fURL)
def get_list_price(isbn):
    """Return the list price of a book.

    Parameters
    ----------
    isbn : string
        ISBN-10 or ISBN-13 of a book.

    Returns
    -------
    list_price : float
        List price of the book in US dollars, or None if not found.

    Examples
    --------
    >>> get_list_price("9780262029445")
    74.0
    """
    AMAZON_URL = "http://www.amazon.com/s/ref=nb_sb_noss?url=search-alias%3Daps&field-keywords="  # noqa
    search_result = requests.get(AMAZON_URL + isbn)
    book_soup = BeautifulSoup(search_result.content, 'lxml')
    price = book_soup.select_one("span.a-text-strike").get_text()
    if price[0] == '$':
        return float(price[1:])
    else:
        return None
def fetch_srouce(url=''):
    """"""
    next = url
    while next:
        print(next)
        r = fetcher.get(url=next)
        if 200 != r.status_code:
            return
        page = BeautifulSoup(r.content, 'html.parser')
        _duris = page.select('tbody[id^="normalthread"] .xst')
        # print(_duris)
        uris = [_du.get('href') for _du in _duris]
        # print(uris)
        for u in uris:
            try:
                fetch_detail(u)
            except:
                with open('fetch_fail.log', 'a') as fd:
                    fd.write(u)
                    fd.write('\n')
        _next = page.select_one('.nxt')
        if _next:
            next = _next.get('href')
        else:
            next = None
    return
def Analyse(self, html):
    soup = BeautifulSoup(html, 'html5lib')
    L = list()
    for i in range(2, 30):
        tr = soup.select_one('#SubmissionSearchForm > table > tbody > tr:nth-of-type({})'.format(i))
        if tr is None:
            break
        dt = dict()
        dt['originOJ'] = 'ZOJ'
        titles = ['realrunid', 'realsubmittime', 'status', 'originProb',
                  'language', 'runtime', 'runmemory', 'nickname']
        for con in tr.contents:
            try:
                dt[titles[0]] = con.text.strip()
                titles = titles[1:]
            except Exception:
                pass
        L.append(dt)
    return L
def parse_product(self, response):
    soup = BeautifulSoup(response.body, 'lxml')
    p = Product()
    for element, path in self.selectors.viewitems():
        node = soup.select_one(path)
        if not node:
            continue
        if element == 'image':
            p[element] = url_fix(urljoin(response.url, node['src']))
        else:
            p[element] = text(node)
    if 'name' in p and 'number' in p:
        p['url'] = response.url
        p['pricing'], p['discountcode'] = get_prices(soup)
        soup.decompose()
        yield p
    else:
        # Only follow links on non-product pages
        soup.decompose()
        for link in self.link_extractor.extract_links(response):
            yield Request(url=link.url)
def get_url(args):
    term = args.term
    if not args.search:
        if term.startswith('myanimelist.net'):
            term = "http://" + term
        if re.fullmatch(r"\d+", term):
            term = "http://myanimelist.net/anime/{}".format(term)
        if not re.fullmatch(r"http://myanimelist.net/anime/\d+/?(/.*)?", term):
            print("invalid url {}".format(term))
            sys.exit(2)
        return term

    # search all
    # search_url = "http://myanimelist.net/search/all"
    # page = requests.get(search_url, params={"q": term})
    # soup = BeautifulSoup(page.content, "html.parser")
    # anime_url = soup.select_one('article > div').select_one('a.hoverinfo_trigger')['href']

    search_url = "http://myanimelist.net/anime.php"
    page = requests.get(search_url, params={"q": term})
    soup = BeautifulSoup(page.content, "html.parser")
    anime_url = soup.select_one('a.hoverinfo_trigger')['href']
    print(" ", anime_url)
    return anime_url
def getLocations():
    r = requests.get('http://www.housingworks.org/donate/drop-off-donations/')
    if r.status_code != 200:
        raise RuntimeError('%s' % r)
    soup = BeautifulSoup(r.text, 'html.parser')
    results = []
    locationsBlock = soup(text=re.compile(r'Drop off locations', re.IGNORECASE))[0].parent.find_next_sibling('div')
    nextLocation = locationsBlock.find_next('a')
    while nextLocation:
        # Load the next location
        link = nextLocation.attrs['href']
        page = requests.get(link)
        if page.status_code != 200:
            raise RuntimeError('%s' % page)
        # Parse the returned data and store it
        pageSoup = BeautifulSoup(page.text, 'html.parser')
        content = pageSoup.select_one('#primary')
        paragraphs = content.findAll('p')
        results.append(HousingWorksLocation(
            name=content.find('h2').getText(),
            address=content.find('h4').getText(),
            hours=paragraphs[0].getText(),
            telephone=paragraphs[1].getText().lower().replace('phone:', '').strip(),
            offerings=paragraphs[2].getText().lower().replace('offerings:', '').strip(),
            link=link
        ))
        # Find the next location
        nextLocation = nextLocation.find_next_sibling('a')
    return results
def tasnimgallery(url):
    try:
        if url.find('http://tasnimnews.com/') != 0:
            raise Exception('Not supported link')
        soup = BeautifulSoup(requests.get(url).text, 'html.parser')
        article = soup.select_one('body.photos article.media')
        images = map(lambda x: {
            'link': x['href'],
            'thumb': x.find('img')['src']
        }, article.select('.row a'))
        result = {
            'title': article.select_one('h1.title').text.strip(),
            'reporter': article.select_one('h4.reporter').text.strip(),
            'time': parsedate(article.select_one('time').text.strip()),
            'lead': article.select_one('h3.lead').text.strip(),
            'images': list(images),
            'url': url
        }
    except Exception as e:
        result = {"error": str(e)}
    response = Response(
        json.dumps(result, indent=1, ensure_ascii=False),
        content_type='application/json;charset=utf8')
    # TODO: This should be limited
    response.headers['Access-Control-Allow-Origin'] = "*"
    return response
def get_audio_by_id(self, owner_id, audio_id):
    response = self._vk.http.get(
        'https://m.vk.com/audio{}_{}'.format(owner_id, audio_id),
        allow_redirects=False
    )
    bs = BeautifulSoup(response.text, 'html.parser')
    link = bs.select_one('.ai_body input[type=hidden]').attrs['value']
    return decode_audio_url(link, self.user_id)
temp_dict['Snippet'] = message['snippet']  # fetching message snippet

try:
    # Fetching message body
    mssg_parts = payld['parts']  # fetching the message parts
    part_one = mssg_parts[0]  # fetching first element of the part
    part_body = part_one['body']  # fetching body of the message
    part_data = part_body['data']  # fetching data from the body
    clean_one = part_data.replace("-", "+")  # decoding from Base64 to UTF-8
    clean_one = clean_one.replace("_", "/")  # decoding from Base64 to UTF-8
    clean_two = base64.b64decode(bytes(clean_one, 'UTF-8'))  # decoding from Base64 to UTF-8
    html = part_body.read()
    soup = BeautifulSoup(html)
    table = soup.select_one("table.data2_s")
    headers = [th.text.encode("utf-8") for th in table.select("tr th")]
    # mssg_body is a readable form of message body
    # depending on the end user's requirements, it can be further cleaned
    # using regex, beautiful soup, or any other method
    temp_dict['Message_body'] = table
except:
    pass

print(temp_dict)
final_list.append(temp_dict)  # This will create a dictionary item in the final list

# This will mark the message as read
GMAIL.users().messages().modify(userId=user_id, id=m_id,
                                body={'removeLabelIds': ['UNREAD']}).execute()
while halaman < limitHalaman:
    raws = f'{linkRaw}{hari.strftime("%d-%m-%Y")}'
    url = BeautifulSoup(requests.get(raws).text.encode("utf-8"), "html.parser")
    # fetch the article body from each sublink and strip the HTML tags
    for i in url.select(".simple-post"):
        linkSemuanya = i.find("a")['href']
        file2.write(linkSemuanya + '\n')
        sublink = BeautifulSoup(requests.get(linkSemuanya).text.encode("utf-8"), "html.parser")
        # drop the googletagpush scripts
        for isiScript in sublink(['script', 'style']):
            isiScript.decompose()
        try:
            isiBerita = sublink.select_one(".post-content").getText().strip().translate(str.maketrans('', '', string.punctuation))
            title = sublink.select_one('.post-title').getText().strip().translate(str.maketrans('', '', string.punctuation))
        except AttributeError:
            pass
        # write into the antaraNews folder
        file = open(f'../data/crawling/berita{halaman}.txt', 'w')
        file.write(f'{title}\n{isiBerita}')
        if halaman == limitHalaman:
            break
        halaman += 1
    else:
        hari += timedelta(days=-1)
print(f'finished {halaman} articles')
def get_total_cases(cls, response):
    soup = BeautifulSoup(response.content, 'html.parser')
    return soup.select_one(cls._lawsuits_total_count_selector).get_text().replace(',', '')
from selenium import webdriver
from bs4 import BeautifulSoup
from pymongo import MongoClient

client = MongoClient('localhost', 27017)
db = client.dbproject

driver = webdriver.Chrome(
    'C:/Users/NTRION/Downloads/chromedriver_win32/chromedriver.exe')

spots = list(db.jeonbuklink.find({}))
for spot in spots:
    driver.get(spot['link'])
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    image = soup.select_one('meta[property="og:image"]')['content']
    location = driver.find_element_by_xpath(
        '//*[@id="contents"]/div[2]/div[3]/div[2]/div/div/ul/li[3]/span').text
    description = driver.find_element_by_xpath(
        '//*[@id="contents"]/div[2]/div[3]/div[1]/div/div/p').text
    if location is not None:
        doc = {
            'name': spot['name'],
            'image': image,
            'location': location,
            'description': description
        }
        db.jeonbuk.insert_one(doc)

driver.quit()
def build_search_indices(pages, version):
    page_views_statistic = {}  # get_page_views_statistic()
    index_objects = []
    wh_index_objects = []

    print("Start building index")
    for url, endpoint in pages:
        if url.endswith('/'):
            url += 'index.html'
        if not url.endswith('.html'):
            continue

        title = ''
        content = ''
        page_type = 'Page'
        page_path = get_page_path_from_url(url)

        page_views = 0
        if url in page_views_statistic:
            page_views = page_views_statistic[url]

        if page_path.startswith('community'):
            page_type = 'Community'
        elif page_path.startswith('docs/reference'):
            page_type = 'Reference'
        elif page_path.startswith('docs/tutorials'):
            page_type = 'Tutorial'

        if page_path.startswith("api/latest/"):
            page_info = get_api_page(True, page_path[4:], dist_path)

            for table in page_info['content']('table'):
                table.extract()
            for overload_group in page_info['content'].findAll("div", {"class": "signature"}):
                overload_group.extract()

            breadcrumbs = page_info['content'].find("div", {"class": "api-docs-breadcrumbs"})
            title = page_info['title']
            if breadcrumbs is not None:
                full_name_parts = list(map(lambda link: link.text, breadcrumbs.findAll("a")))
                if "kotlin-stdlib" in full_name_parts:
                    full_name_parts.remove("kotlin-stdlib")
                else:
                    full_name_parts.remove("kotlin.test")
                title = " › ".join(full_name_parts).replace('<', '&lt;').replace('>', '&gt;')
                breadcrumbs.extract()

            page_type = "Standard Library" if "jvm/stdlib" in url else "Kotlin Test"
            content = page_info['content'].find('article', {"role": "main"})
        else:
            html_content = get_page_content(url)
            parsed = BeautifulSoup(html_content, "html.parser")

            if parsed.find("meta", {"http-equiv": "refresh"}):
                continue

            body_title = parsed.select_one("body[data-search-title]")
            if body_title:
                title = body_title.attrs["data-search-title"]
            if not title:
                title_node = parsed.find("title")
                if title_node:
                    title = title_node.text

            # Our default pages
            content = parsed.find("div", {"class": "page-content"})
            # Our modern pages
            if content is None:
                content = parsed.find("article", {"class": "page-content"})
            # WebHelp pages
            if content is None:
                content = parsed.find("article", {"class": "article"})

        if title and content:
            page_indexer = get_page_index_objects
            if parsed.select_one("body[data-article-props]"):
                page_type = "Documentation"
                page_indexer = get_webhelp_page_index_objects
            elif page_type == "Page":
                page_indexer = get_markdown_page_index_objects

            print("processing " + url + ' - ' + page_type)
            page_indices = page_indexer(content, url, page_path, title, page_type, page_views)
            index_objects += page_indices

            def wh(*args):
                return to_wh_index(version, *args)

            wh_index_objects += list(map(wh, page_indices.copy()))
        else:
            print('skip: ' + url + ' unknown page content in with title: ' + title)

    wh_index = get_wh_index()
    if wh_index:
        print("Submitting WH index objects to " + wh_index.index_name + " index")
        wh_index.add_objects(wh_index_objects)

    print("Index objects successfully built")

    index = get_index()
    print("Submitting index objects to " + index.index_name + " index")
    index.add_objects(index_objects)
import codecs
from bs4 import BeautifulSoup
from konlpy.tag import Twitter
from gensim.models import word2vec

fp = codecs.open('BEXX0003.txt', 'r', encoding='utf-16')
soup = BeautifulSoup(fp, 'html.parser')
body = soup.select_one('body > text')
text = body.getText()

twiter = Twitter()
results = []
lines = text.split('\n')
for line in lines:
    malist = twiter.pos(line, norm=True, stem=True)
    r = []
    for word in malist:
        if not word[1] in ['josa', 'Eomi', 'Punctuation']:
            r.append(word[0])
    rl = (' '.join(r)).strip()
    results.append(rl)
    print(rl)

wakati_file = 'toji.wakati'
with open(wakati_file, 'w', encoding='utf-8') as fp:
    fp.write('\n'.join(results))

data = word2vec.LineSentence(wakati_file)
model = word2vec.Word2Vec(data, size=200, window=10, hs=1, min_count=2, sg=1)
model.save('toji.model')
print('ok')
def __init__(self):
    url_list = [
        'http://www.diabetes.or.kr/general/class/index.php?idx=1',
        'http://www.diabetes.or.kr/general/class/index.php?idx=2',
        'http://www.diabetes.or.kr/general/class/index.php?idx=3',
        'http://www.diabetes.or.kr/general/class/index.php?idx=4',
        'http://www.diabetes.or.kr/general/class/index.php?idx=5',
        'http://www.diabetes.or.kr/general/class/medical.php?mode=view&number=322&idx=6',
        'http://www.diabetes.or.kr/general/class/medical.php?mode=view&number=325&idx=1',
        'http://www.diabetes.or.kr/general/class/medical.php?mode=view&number=324&idx=1',
        'http://www.diabetes.or.kr/general/class/medical.php?mode=view&number=323&idx=1',
        'http://www.diabetes.or.kr/general/class/medical.php?mode=view&number=327&idx=2',
        'http://www.diabetes.or.kr/general/class/medical.php?mode=view&number=326&idx=2',
        'http://www.diabetes.or.kr/general/class/medical.php?mode=view&number=30&idx=4',
        'http://www.diabetes.or.kr/general/class/medical.php?mode=view&number=7&idx=5',
        'http://www.diabetes.or.kr/general/class/medical.php?mode=view&number=6&idx=5',
        'http://www.diabetes.or.kr/general/class/complications.php?code=complication&number=337&mode=view&idx=1',
        'http://www.diabetes.or.kr/general/class/complications.php?code=complication&number=336&mode=view&idx=2',
        'http://www.diabetes.or.kr/general/class/type.php',
        'http://www.diabetes.or.kr/general/class/gestational.php'
    ]

    file_count = len(os.walk('./json').__next__()[2]) + 1
    BASE_DIR = './json/'
    FILE_NAME = f'result{file_count}.json'

    json_arch = OrderedDict()
    json_arch["category"] = "당뇨병"

    title_list = []
    sub_title_list = []
    contents_list = []

    for url in tqdm(url_list):
        req = requests.get(url)
        html = req.text
        soup = BeautifulSoup(html, 'html.parser')

        # question text
        title = soup.select_one('div.cTop > span:nth-of-type(2)')
        if title:
            title_list.append(title.text)
        else:
            title_list.append("")

        # question text (sub titles)
        all_sub_title = []
        content_all = soup.select_one('body')
        sub_title = soup.select('div.food')
        if sub_title:
            for sub_tit in content_all.find_all('div', 'rnd_center'):
                all_sub_title.append(sub_tit.text)
            sub_title_list.append(all_sub_title)
        else:
            sub_title_list.append([])

        # answer text
        if len(sub_title) > 0:
            tmp_contents_list = []
            for idx in range(len(all_sub_title)):
                tmp_contents = []
                next_content = sub_title[idx].find_next_sibling('div')
                while True:
                    next_content = next_content.find_next_sibling()
                    tmp_contents.append(next_content.text)
                    if next_content.find_next_sibling() == next_content.find_next_sibling('div') \
                            or next_content.find_next_sibling() == next_content.find_next_sibling('table'):
                        break
                tmp_contents_list.append(tmp_contents)
            contents_list.append(tmp_contents_list)
        else:
            tmp_contents = []
            contents = soup.select('p.0')
            for content in contents:
                tmp_contents.append(content.text)
            contents_list.append([tmp_contents])

    json_arch["title"] = title_list
    json_arch["sub_title"] = sub_title_list
    json_arch["content"] = contents_list

    with open(os.path.join(BASE_DIR, FILE_NAME), 'w', encoding="utf-8") as json_file:
        json.dump(json_arch, json_file, ensure_ascii=False, indent="\t")
    print("json saved", "path:", BASE_DIR, FILE_NAME)
from bs4 import BeautifulSoup

# HTML to analyze
html = """
<html><body>
<div id="meigen">
  <h1>위키북스 도서</h1>
  <ul class="items">
    <li>유니티 게임 이펙트 입문</li>
    <li>스위프트로 시작하는 아이폰 앱 개발 교과서</li>
    <li>모던 웹사이트 디자인의 정석</li>
  </ul>
</div>
</body></html>
"""

# Parse the HTML
soup = BeautifulSoup(html, 'html.parser')

# Extract the needed parts with CSS queries
# Extract the title
h1 = soup.select_one("div#meigen > h1").string
print("h1 = ", h1)

# Extract the list items
li_list = soup.select("div#meigen > ul.items > li")
for li in li_list:
    print("li = ", li.string)
def get_hottest_article():
    # Financial News, YTN, SBS
    target_media = ["014", "052", "055"]

    # extract the most-viewed news URLs from each target media outlet
    for target_num in target_media:
        # print(target_num)
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'
        }
        data = requests.get(
            f'https://news.naver.com/main/ranking/office.nhn?officeId={target_num}',
            headers=headers)
        soup = BeautifulSoup(data.text, 'html.parser')
        articles = soup.select(
            '#wrap > div.rankingnews > div.rankingnews_box._officeResult > div:nth-child(2) > ul > li'
        )

        for article in articles:
            url = article.select_one('div > a')['href']
            # print(url)
            target_url = f'https://news.naver.com{url}'
            # print(target_url)
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'
            }
            data = requests.get(target_url, headers=headers)
            soup2 = BeautifulSoup(data.text, 'html.parser')
            contents = soup2.select('#main_content')
            og_desc = soup2.select_one('meta[property="og:description"]')['content']

            for content in contents:
                title = soup2.select_one('#articleTitle').text
                date = soup2.select_one(
                    'div.article_header > div.article_info > div > span.t11').text
                media = soup2.select_one(
                    'div.article_header > div.press_logo > a > img')['alt']

                # take only the year/month/day part of date and convert to datetime
                target = date[:10]
                date_time_obj = datetime.datetime.strptime(target, '%Y.%m.%d')

                # replace 오전/오후 in date with AM/PM and zero-pad the minutes
                if date[12:14] == '오전':
                    date1 = date[:12] + 'AM' + date[14:]
                else:
                    date1 = date[:12] + 'PM' + date[14:]
                if date[16] == ':':
                    date1 = date1[:15] + '0' + date1[15:]

                # convert the date string to a datetime object
                date_time_obj2 = datetime.datetime.strptime(date1, '%Y.%m.%d. %p %I:%M')

                # build a unique key so the same article is not fetched twice
                parts = urlparse(target_url)
                query_string = parse_qs(parts.query)
                sid1 = query_string["sid1"][0]
                oid = query_string["oid"][0]
                aid = query_string["aid"][0]
                unique_key = f"{sid1}-{oid}-{aid}"

                # if the url is the Naver news home, store it in the db with the korea icon
                if target_url[:27] == "https://news.naver.com/main":
                    doc = {
                        'icon': "../static/south-korea.png",
                        'unique_key': unique_key,
                        'url': url,
                        'title': title,
                        'desc': og_desc,
                        'date': date,
                        'datetime': date_time_obj2,
                        'datetime_server': date_time_obj,
                        'media': media,
                    }
                    # db.hottestNews.drop()
                    # insert into the db only when unique_key is not already present
                    # Tutor: check whether the document already exists
                    document = db.hottestNews.find_one({"unique_key": unique_key})
                    # Tutor: add it if it does not
                    if document is None:
                        db.hottestNews.insert_one(doc)
'''
Created on 2018. 5. 12.

@author: Administrator
'''
import codecs
from bs4 import BeautifulSoup
from konlpy.tag import Twitter
from gensim.models import word2vec

# open the file with utf-16 encoding and print the text --- (※1)
fp = codecs.open(filename='BEXX0003.txt', mode='r', encoding='utf-8')
soup = BeautifulSoup(fp, 'html.parser')
body = soup.select_one('body > text')  # selects only the first of the matching tags (cafe 62)
text = body.getText()
# print( text )

# process the text line by line --- (※2)
twitter = Twitter()
results = []
lines = text.split('\r\n')
# print( lines )
for line in lines:
    # morphological analysis --- (※3)
    # use the base form of each word
    malist = twitter.pos(line, norm=True, stem=True)
    # print( malist )
    r = []
    for word in malist:
        # exclude endings/particles/punctuation
        if not word[1] in ["Josa", "Eomi", "Punctuation"]:
            r.append(word[0])
    rl = (" ".join(r)).strip()
    results.append(rl)
    # print(rl)

# write the output to a file --- (※4)
import requests
from bs4 import BeautifulSoup

# 1. Send a request to the desired address and store the response.
html = requests.get('https://finance.naver.com/sise/').text
# 2. Convert (refine) the data into an easy-to-handle form,
print(html)
soup = BeautifulSoup(html, 'html.parser')
# 3. pick out only what we want from the converted data,
print(soup)
kospi = soup.select_one('#KOSPI_now').text
# 4. and print it.
print(kospi)
import time

options = webdriver.ChromeOptions()
options.add_experimental_option('excludeSwitches', ['enable-logging'])  # suppress the selenium message popup
browser = webdriver.Chrome(executable_path="./chromedriver.exe", options=options)
browser.get('https://datalab.naver.com/keyword/realtimeList.naver?where=main')
time.sleep(1)

html = browser.page_source
soup = BeautifulSoup(html, 'html.parser')

ranking = soup.select(
    '#content > div > div.selection_area > div.selection_content > div.field_list > div > div > ul > li > div > span.item_title_wrap > span.item_title'
)
day = soup.select_one(
    '#content > div > div.selection_area > div.selection_header > div:nth-child(1) > div > div > div > div.date_indo > a.date_box._date_trigger > span.date_txt._title_ymd'
)
tiktok = soup.select_one(
    '#content > div > div.selection_area > div.selection_header > div:nth-child(1) > div > div > div > div.time_indo > a.time_box._time_trigger > span.time_txt._title_hms'
)

print(day.text, tiktok.text, 'real-time trending searches as of this time')

n = 0
for i in ranking:
    n += 1
    print(n, i.text)

browser.quit()
    # (continues from a try block opened earlier in the loop)
    # go back to initial page
    driver.execute_script("window.history.go(-1)")
    # start new line for new rider profile
    f.write("\n")
except:
    print('FAIL: 404 go back')
    # go back to initial page
    driver.execute_script("window.history.go(-1)")

table_soup = BeautifulSoup(driver.page_source, 'html.parser', from_encoding='utf8')
# find url in table
url = rider_link.strip('http://www.worldsnowboarding.org/')
find_link = table_soup.select_one("a[href*='" + url + "']")
# find parent of url - this is the row that has all the rider info
parent = find_link.find_parent('tr', attrs={'class': 'ranking'})
stat_array = parent.find_all('td')

profile[1] = int(stat_array[0].span.text.strip('.'))  # position
name = stat_array[3].a.text.split(',')
first_name = name[1]
last_name = name[0]
profile[0] = str(first_name + last_name)  # name
profile[5] = stat_array[4].span.text  # nationality
if stat_array[5] is not None or len(stat_array[5]) > 0:
    profile[4] = stat_array[5].text  # age
profile[2] = float(stat_array[8].text)  # points

profile_str = ', '.join(str(x) for x in profile)
def calcSUMS(params):
    global sum_distance, sum_elevation_gain, sum_descent, sum_refreshment_points, sum_time_limit
    url = "https://itra.run/calend.php?mode=getEvt&id={}&annee={}&idc={}&idx={}".format(*params)
    session = requests.Session()
    session.max_redirects = 9999999
    dct = {}
    # url = url.strip('"')
    page = session.get(url, headers=headers, verify=False)
    soup = BeautifulSoup(page.content, "html.parser")

    All_info = soup.select("div#calevt_fich tr")  # get_text(strip=True)
    lst_1 = []
    dct_2 = {}
    for info in All_info:
        tds = info.select("td")
        if len(tds) == 2:
            dct_2[tds[0].get_text().strip()] = tds[1].get_text().strip()
        elif len(tds) == 1:
            if tds[0].get_text().strip() != "":
                if tds[0].select_one("a", href=True) is not None:
                    registr_url = tds[0].select_one("a", href=True)["href"]
                else:
                    lst_1.append(tds[0].get_text().strip())

    try:
        distance = re.sub('[(){}<>]', '', dct_2["Distance"]).split()[0].strip()
    except:
        distance = ""
    try:
        elevation_gain = dct_2["Ascent"].strip()
    except:
        elevation_gain = ""
    try:
        descent = dct_2["Descent"].strip()
    except:
        descent = ""
    try:
        refreshment_points = dct_2["Refreshment points"].strip()
    except:
        refreshment_points = ""
    try:
        time_limit = dct_2["Maximum time"].strip()
    except:
        time_limit = ""

    sum_distance += reprDist(distance)[0]
    sum_elevation_gain += reprDist(elevation_gain)[0]
    sum_descent += reprDist(descent)[0]
    sum_refreshment_points += reprDist(refreshment_points)[0]
    sum_time_limit += get_sec(time_limit)

    th = soup.select_one("div#calevt_fich tr th", onclick=True)
    if th is not None:
        try:
            stage = th.select_one("a.rightarr", onclick=True)
            params = eval(stage['onclick'].split(";")[0])
            calcSUMS(params)
        except:
            return 1
def extract_review(self, parsed_claim_review_page: BeautifulSoup) -> str:
    return parsed_claim_review_page.select_one(
        "section.l-section.wpb_row.height_small div[itemprop=\"text\"]"
    ).text
def job_page(url, _id):
    res = retry(url)
    soup = BeautifulSoup(res.content, 'lxml')

    # === Extract Static JobBKK === #
    try:
        static_detail_list = [_.text.strip() for _ in soup.select("div.statis-detail")]
    except Exception as e:
        print("Step job_static", e)

    # === Extract Interesting === #
    try:
        applicants = soup.select_one("#loadnumapply").text.strip()
    except Exception as e:
        print("Step job_interesting", e)

    # === Extract Info === #
    try:
        info = soup.select_one("div.row-left")
        detail_list = [_.text.strip() for _ in info.select_one(
            "div.job-detail.border-b").select("span")]
        skill_list = [_.text.strip() for _ in info.select_one(
            "div[itemprop=skills]").select("span")]
        incentives_detail_list = [_.text.strip() for _ in info.select_one(
            "div[itemprop=incentives]").select("li")]
        incentives_additional = info.select_one(
            "div[itemprop=incentives] div").text.strip()
    except Exception as e:
        print("Step job_info", e)

    # === Extract Transport === #
    try:
        jobLocation = info.select_one("div[itemprop=jobLocation]")
        transport_detail_list = [_.text.strip().replace(
            'ไม่มี', '') for _ in jobLocation.select("div.transport-detail")]
        # transport_additional = jobLocation.select_one("div.transport-additional span").text.strip()
    except Exception as e:
        print("Step job_transport", e)

    # === Extract Json === #
    try:
        data_dict = json.loads(soup.find_all(
            'script', {"type": "application/ld+json"})[1].text, strict=False)
        job_title = data_dict['title']
        description = data_dict['description']
        company = data_dict['hiringOrganization']['name']
        job_com_id = data_dict['hiringOrganization']['sameAs']
        job_com_id = re.search('\d+/\d+', job_com_id).group(0).split("/")
        date_post = data_dict['datePosted']
    except Exception as e:
        print("Step job_json_data", e)

    # === Extract Main Info === #
    if re.search('-', skill_list[2]) != None:
        edu_clean = skill_list[2].replace(' ', '').split('-')
        edu_clean = edu_dict[edu_clean[0]] + '-' + edu_dict[edu_clean[1]]
    else:
        try:
            edu_clean = edu_dict[edu_clean]
        except KeyError:
            print("Step KeyError: ", edu_clean)
            edu_clean = ""

    try:
        job_dict = OrderedDict({
            'occupation_id': _id,
            'job_id': int(job_com_id[1]),
            'job_title': job_title,
            'job_description': description.replace('\n', '|'),
            'num_position': int(detail_list[0].replace('ตำแหน่ง', '').replace('ไม่ระบุ', '').replace('ไม่จำกัด', 'Inf').strip()),
            'job_type': detail_list[1],
            'company_id': int(job_com_id[0]),
            'company_name': company,
            'company_location': {  # === Location Company === #
                'street_address': data_dict['jobLocation']['address']['streetAddress'],
                'local_address': data_dict['jobLocation']['address']['addressLocality'],
                'region_address': data_dict['jobLocation']['address']['addressRegion'],
                'postal_code': data_dict['jobLocation']['address']['postalCode'],
                'country_address': data_dict['jobLocation']['address']['addressCountry']
            },
            'work_location': detail_list[2].split(','),
            'salary': detail_list[3].replace(',', '').replace(' ', ''),
            'vacation': detail_list[5].replace('ไม่ระบุ', ''),
            'work_time': detail_list[4].replace('ไม่ระบุ', ''),
            'gender': skill_list[0].replace(' ', '').replace('ชาย', 'M').replace('หญิง', 'F').replace(',', ''),
            'age': skill_list[1].replace('ปีขึ้นไป', '+').replace('ทุกช่วงอายุ', '').replace(' ', ''),
            'edu': edu_clean.strip(),
            'exp': skill_list[3].replace('ปีขึ้นไป', '+').replace(' ', ''),
            'other': skill_list[4].replace('ไม่ระบุ', ''),
            'incentives': incentives_detail_list,
            'incentives_add': incentives_additional,
            'transport': {
                'bus': transport_detail_list[0],
                'bts': transport_detail_list[1],
                'mrt': transport_detail_list[2],
                'arl': transport_detail_list[3]
            },
            'applicants': int(applicants),
            'job_active': static_detail_list[1],
            'job_view': int(static_detail_list[0].replace(',', '')),
            'job_date_post': date_post,
        })
    except Exception as e:
        print("Step job_dict", e)

    # try:
    #     col_bkk_job.insert_one(job_dict)
    # except Exception as e:
    #     print('db', e)

    return job_dict
class Plugin():
    # registers the plugin and the url pattern it should be called for
    # the first parameter is the file name
    # the second is the URL pattern as a regex
    def register_plugin(self, PluginManager):
        PluginManager.register_plugin('jornaldebrasilia_com_br',
                                      r"^https?://www.jornaldebrasilia.com.br/"
                                      "(cidades|brasil|futebol|mundo|economia|politica-poder|politica-e-poder)/")

    # returns a dictionary with three keys
    # subtitle
    # date_published as a datetime object
    # content
    # content is the only required one
    def extract_metadata(self, url):
        self.url = url
        self.page = None
        self.bs = None
        r = requests.get(self.url)
        if r.status_code == 200:
            self.page = r.text
            self.bs = BeautifulSoup(self.page, 'html.parser')
            subtitle = self._get_subtitle()
            date_published = self._get_published_date()
            self._remove_elements()
            content = self._get_content()
            metadata = dict(subtitle=subtitle,
                            date_published=date_published,
                            content=content)
            return metadata
        else:
            return None

    # removes unwanted elements from the page, headings in the
    # middle of the article etc... if needed.
    def _remove_elements(self):
        for div in self.bs.find_all(['style']):
            div.decompose()

    # finds the subtitle via a CSS selector
    def _get_subtitle(self):
        subtitle = self.bs.select_one('h1.entry-title')
        return subtitle.get_text()

    # finds and parses the date into a datetime object
    def _get_published_date(self):
        state = str(self.bs.select('.entry-date'))
        match_date = re.search(r'([0-9])+([/])+([0-9])+([/])+([0-9])+', state).group(0)
        date_published = datetime.strptime(match_date, "%d/%m/%Y").date()
        return date_published

    # finds the paragraphs
    def _get_content(self):
        paragraphs_list = []
        paragraphs = self.bs.select('.td-post-content > p')
        for paragraph in paragraphs:
            if len(paragraph.text) > 20:
                paragraphs_list.append(paragraph.text.strip())
        return ' '.join(paragraphs_list)
ds = []
count = 0
for info in match_info:
    date, countries, ground, href, parent_href = info
    count += 1
    print(count)
    print(parent_href)
    print(href)
    page = requests.get(href)
    soup = BeautifulSoup(page.content, 'html.parser')
    table = soup.select_one('body > table:nth-child(2) > tr > td > table:nth-child(3)')
    if is_aborted(soup):
        print('Detect match aborted')
        continue
    elif is_conceded(soup):
        print('Detect match conceded')
        continue
    elif table is None:
        print('Possibly match aborted')
        break
    tr_all = table.select('tr')
    dt = []
def scrap_guy_hoquet(data_folder='data', replace_strategy='abort'):
    """
    Web scraping function for www.guy-hoquet.com meant to retrieve relevant
    info from property ads in Ile-de-France.

    Parameters
    ----------
    data_folder: str, default 'data'
        path of the folder where the data will be written, created when needed
    replace_strategy: str, any from ['abort', 'replace'], default 'abort'
        strategy to follow if a file with the same name as the data file already exists

    Returns
    -------
    None
    """
    url = 'https://www.guy-hoquet.com/biens/result#1&p=1&f10=2&f20=75_c2,77_c2,78_c2,91_c2,92_c2,93_c2,94_c2,95_c2&f30=appartement,maison'
    links = []

    driver = webdriver.Firefox()
    driver.implicitly_wait(5)  # seconds
    driver.get(url)
    driver.find_element_by_css_selector('div#accept-all-cookies').click()
    links.extend([
        a.get_attribute('href')
        for a in driver.find_elements_by_css_selector('a.property_link_block')
    ])
    while True:
        try:
            driver.find_element_by_css_selector('li.page-item.next a').click()
        except NoSuchElementException:
            break
        links.extend([
            a.get_attribute('href')
            for a in driver.find_elements_by_css_selector('a.property_link_block')
        ])
    driver.close()

    data = []
    for link in tqdm(links):
        soup = BeautifulSoup(requests.get(link).content)
        try:
            prop_type = soup.select_one('h1.name.property-name').text
            city = soup.select_one('div.add').text
            price = soup.select_one('div.price').text.replace('\n', '').strip()
            descr = soup.select_one('span.description-more').text.replace(
                '\n', '').replace('Voir moins', '').strip()
            feats = [tag.text for tag in soup.select('div.ttl')]
            feats2 = [
                re.sub(r'\s+', ' ', re.sub(r'\n+', '', tag.text)).strip()
                for tag in soup.select('div.horaires-item')
            ]
            neighborhood = re.sub(
                r'\s+', ' ',
                re.sub(r'\n+', '', soup.select_one('div.quartier-info.mt-4').text)).strip()
        except AttributeError:
            continue
        data.append([prop_type, city, price, descr, feats, feats2, neighborhood])

    df = pd.DataFrame(data, columns=[
        'prop_type', 'city', 'price', 'descr', 'feats', 'feats2', 'neighborhood'
    ])

    # Check if data file name already exists: if so follow replace_strategy, if not then create it
    filename = f'guy_hoquet_{dt.now().year}_{dt.now().month}_{dt.now().day}.csv'
    if not os.path.isfile(os.path.join(data_folder, filename)):
        df.to_csv(os.path.join(data_folder, filename), sep='|', index=False)
    else:
        if replace_strategy == 'abort':
            raise FileExistsError(
                f"File {os.path.join(data_folder, filename)} already exists. Scraping aborted. "
                f"To replace the existing file, change replace_strategy to 'replace'."
            )
        elif replace_strategy == 'replace':
            df.to_csv(os.path.join(data_folder, filename), sep='|', index=False)
def scrape():
    browser = init_browser()

    # NASA Mars fact data dictionary
    mars_fact_data = {}

    # NASA Mars News
    # URL of page to be scraped
    nasa_url = "https://mars.nasa.gov/news/?page=0&per_page=40&order=publish_date+desc%2Ccreated_at+desc&search=&category=19%2C165%2C184%2C204&blank_scope=Latest"
    # Retrieve page with the requests module
    response = requests.get(nasa_url)
    # Create BeautifulSoup object; parse with 'html.parser'
    soup = BeautifulSoup(response.text, 'html.parser')
    results = soup.find_all('div', class_="slide")
    results[0]
    news_title = soup.find('div', class_="content_title").text.strip()
    news_p = soup.find('div', class_="rollover_description_inner").text.strip()
    # Store in mars_fact_data dictionary
    mars_fact_data['news_title'] = news_title
    mars_fact_data['news_paragraph'] = news_p

    # JPL Mars Space Images - Featured Image
    # URL of page to be scraped
    jpl_url = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"
    # Retrieve page with the requests module
    response = requests.get(jpl_url)
    browser.visit(jpl_url)
    time.sleep(3)
    browser.find_by_id("full_image").click()
    time.sleep(3)
    browser.find_link_by_partial_text('more info').click()
    time.sleep(3)
    html = browser.html
    # Create BeautifulSoup object; parse with 'html.parser'
    soup = BeautifulSoup(html, 'html.parser')
    # Relative image url for featured image
    img_url_rel = soup.select_one('figure.lede a img').get("src")
    # Featured image url
    featured_image_url = f'https://www.jpl.nasa.gov{img_url_rel}'
    # Store in mars_fact_data dictionary
    mars_fact_data["featured_image"] = featured_image_url

    # Web Scraping - Mars Weather twitter website
    # URL of page to be scraped
    twitter_url = "https://www.twitter.com/marswxreport?lang=en"
    # Retrieve page with the requests module
    response = requests.get(twitter_url)
    browser.visit(twitter_url)
    time.sleep(3)
    html = browser.html
    weather_soup = BeautifulSoup(html, 'html.parser')
    # find tweet with weather info
    results = weather_soup.find_all(
        "div",
        class_="css-901oao r-hkyrab r-1qd0xha r-a023e6 r-16dba41 r-ad9z0x r-bcqeeo r-bnwqim r-qvutc0"
    )
    # loop through the results to look for InSight and Sol text
    # for tweet in results:
    #     if 'InSight' and 'Sol' in tweet:
    #         print(tweet)
    #         break
    # mars_weather = tweet.text.strip()
    mars_weather = 'InSight sol 457 (2020-03-10) low -95.7ºC (-140.3ºF) high -9.1ºC (15.6ºF) winds from the SSE at 6.5 m/s (14.5 mph) gusting to 21.0 m/s (46.9 mph) pressure at 6.30 hPa'
    # Store in mars_fact_data dictionary
    mars_fact_data["mars_weather"] = mars_weather

    # Mars Facts - Table
    # URL of page to be scraped
    Marsfacts_url = "https://space-facts.com/mars/"
    tables = pd.read_html(Marsfacts_url)
    df = tables[0]
    df.columns = ["Description", "values"]
    df.set_index("Description", inplace=True)
    # Convert dataframe to html
    html_table = df.to_html(classes="table table-striped")
    html_table = html_table.replace("\n", "")
    # Store in mars_fact_data dictionary
    mars_fact_data["mars_facts_table"] = html_table

    # Mars Hemispheres
    # Base url
    hemispheres_url = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
    # Retrieve page with the requests module
    response = requests.get(hemispheres_url)
    browser.visit(hemispheres_url)
    time.sleep(3)
    # Get a List of All the Hemispheres
    image_hemisphere_urls = []
    links = browser.find_by_css("a.product-item h3")
    for item in range(len(links)):
        hemisphere = {}
        # Find element on each loop
        browser.find_by_css("a.product-item h3")[item].click()
        time.sleep(3)
        # Identify sample image anchor tag and extract <href>
        sample_element = browser.find_by_text("Sample").first
        hemisphere["img_url"] = sample_element["href"]
        # Get title for each hemisphere
        hemisphere["title"] = browser.find_by_css("h2.title").text
        # Append the hemisphere object to the list
        image_hemisphere_urls.append(hemisphere)
        # Navigate backwards
        browser.back()
    # Store in mars_fact_data dictionary
    mars_fact_data["image_mars_hemispheres"] = image_hemisphere_urls

    return mars_fact_data
from bs4 import BeautifulSoup
import sys, io

sys.stdout = io.TextIOWrapper(sys.stdout.detach(), encoding='utf-8')
sys.stderr = io.TextIOWrapper(sys.stderr.detach(), encoding='utf-8')

fn = open('cars.html', encoding='utf-8')
soup = BeautifulSoup(fn, 'html.parser')


def car_func(select):
    print('car_func', soup.select_one(select).string)


car_lambda = lambda q: print('car_lambda', soup.select_one(q).string)

car_func('#gr')
car_func('li#gr')
car_func('ul > li#gr')
car_func("#cars > #gr")
car_func("li[id='gr']")

car_lambda('#gr')
car_lambda('li#gr')
car_lambda('ul > li#gr')
car_lambda("#cars > #gr")
car_lambda('#gr')
car_lambda("li[id='gr']")

print('car_func', soup.select('li')[3].string)
print('car_func', soup.find_all('li')[3].string)
for i in range(0, 50):
    # connect with ChromeDriver, allow a few seconds for resources to load
    driver = webdriver.Chrome('./chromedriver')

    url = main_url + url_path[i]['url_path']
    driver.get(url)  # open the web page for this url in the driver
    sleep(1)  # wait 1 second while the page loads
    req = driver.page_source  # grab the html
    driver.quit()  # we have the html, so close the driver

    soup = BeautifulSoup(req, 'html.parser')
    header = soup.select_one('#original_header')

    poster_path = header.select_one('div.poster img.poster')['src']
    poster_url = main_url + poster_path

    info = header.select_one('section.header')
    title = info.select_one('h2 > a').text
    runtime = info.select_one('div.facts > span.runtime').text.strip()

    genres = info.select('div.facts > span.genres > a')
    genre = ""
    for g in genres:
        genre = genre + " " + g.text
def scrap_orpi(data_folder='data', replace_strategy='abort'):
    """ Web scraping function for www.orpi.com meant to retrieve relevant
    info from property ads in Ile-de-France.

    Parameters
    ----------
    data_folder: str, default 'data'
        path of the folder where the data will be written
    replace_strategy: str, any from ['abort', 'replace'], default 'abort'
        strategy to follow if a file with the same name as the data file already exists

    Returns
    -------
    None
    """
    BASE_URL = 'https://www.orpi.com/recherche/rent?'
    depts = [
        'paris', 'seine-et-marne', 'yvelines', 'essonne', 'hauts-de-seine',
        'seine-saint-denis', 'val-de-marne', 'val-d-oise'
    ]
    links = {dept: [] for dept in depts}

    print('Getting links to property ads for each département ...')
    for dept in tqdm(depts):
        url = f'{BASE_URL}transaction=rent&resultUrl=&realEstateTypes[0]=maison&realEstateTypes[1]=appartement&locations[0][value]={dept}&agency=&minSurface=&maxSurface=&newBuild=&oldBuild=&minPrice=&maxPrice=&sort=date-down&layoutType=mixte&nbBedrooms=&page=&minLotSurface=&maxLotSurface=&minStoryLocation=&maxStoryLocation='
        driver = webdriver.Firefox()
        driver.get(url)
        # accept cookies
        driver.find_element_by_css_selector('button.c-btn.c-btn--lg').click()
        # append property ads links
        soup = BeautifulSoup(driver.page_source)
        links[dept].extend([
            a.get('href')
            for a in soup.select('a.u-link-unstyled.c-overlay__link')
        ])
        # repeat for every page
        next_page = driver.find_elements_by_css_selector('a.c-pagination__link')[-1] \
            .find_element_by_css_selector('span') \
            .text == 'Suivant'
        while next_page:
            driver.find_elements_by_css_selector(
                'a.c-pagination__link')[-1].click()
            next_page = driver.find_elements_by_css_selector('a.c-pagination__link')[-1] \
                .find_element_by_css_selector('span') \
                .text == 'Suivant'
            soup = BeautifulSoup(driver.page_source)
            links[dept].extend([
                a.get('href')
                for a in soup.select('a.u-link-unstyled.c-overlay__link')
            ])
        driver.close()
    print('\n')

    word2num = {
        'paris': 75, 'seine-et-marne': 77, 'yvelines': 78, 'essonne': 91,
        'hauts-de-seine': 92, 'seine-saint-denis': 93, 'val-de-marne': 94,
        'val-d-oise': 95
    }

    data = []
    for dept in links.keys():
        print(f'Scraping {dept} ...')
        for link in tqdm(links[dept]):
            url = 'https://www.orpi.com' + link
            soup = BeautifulSoup(requests.get(url).content)
            try:
                ref = soup.select_one('span.u-text-xs').text
            except AttributeError:
                continue
            prop_type = soup.select_one('span.u-text-xl').text.replace(
                '\n', '').strip()
            rooms, surface = soup.select_one(
                'span.u-h3.u-color-primary').text.split(' • ')
            city = soup.select_one('span.u-text-lg').text
            price = soup.select_one('span.u-h1').text.replace('\xa0', '')
            descr = soup.select_one(
                'div.c-section__inner div.o-container p').text.replace(
                    '\n', '').strip()
            feats = [span.text for span in soup.select('span.c-badge__text')]
            try:
                conso = soup.select_one(
                    'abbr.c-dpe__index.c-dpe__index--5').text
            except AttributeError:
                conso = ''
            try:
                emiss = soup.select_one(
                    'abbr.c-dpe__index.c-dpe__index--3').text
            except AttributeError:
                emiss = ''
            data.append([
                ref, prop_type, city, word2num[dept], rooms, surface, price,
                descr, conso, emiss, feats
            ])
        print('\n')

    df = pd.DataFrame(data, columns=[
        'ref', 'prop_type', 'city', 'dept', 'rooms', 'surface', 'price',
        'descr', 'conso', 'emiss', 'feats'
    ])

    # Check if data file name already exists: if so follow replace_strategy,
    # if not then create it
    filename = f'orpi_{dt.now().year}_{dt.now().month}_{dt.now().day}.csv'
    if not os.path.isfile(os.path.join(data_folder, filename)):
        df.to_csv(os.path.join(data_folder, filename), sep='|', index=False)
    else:
        if replace_strategy == 'abort':
            raise FileExistsError(
                f"File {os.path.join(data_folder, filename)} already exists. "
                "Scraping aborted. To replace the existing file, change "
                "replace_strategy to 'replace'."
            )
        elif replace_strategy == 'replace':
            df.to_csv(os.path.join(data_folder, filename), sep='|', index=False)
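# Usage sketch for scrap_orpi (not part of the original): the imports are
# assumptions inferred from the names used inside the function (requests,
# BeautifulSoup, webdriver, tqdm, pd, dt, os) and would normally sit at the
# top of the module.
import os
import requests
import pandas as pd
from datetime import datetime as dt
from bs4 import BeautifulSoup
from selenium import webdriver
from tqdm import tqdm

if __name__ == '__main__':
    # scrap_orpi writes into this folder but does not create it itself
    os.makedirs('data', exist_ok=True)
    scrap_orpi(data_folder='data', replace_strategy='replace')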
* Crawling can cause damage such as overloading the server, so get proper
  advice on the legal issues involved
* Be especially careful when crawling or scraping for commercial purposes
  rather than for learning
* Check robots.txt to see whether crawling is allowed
  . Search-engine crawler bots read this file and then decide what to collect
  . Reference: https://searchadvisor.naver.com/guide/seo-basic-robots
'''
import requests                  # access web pages
from bs4 import BeautifulSoup    # parse HTML documents

res = requests.get('https://finance.naver.com')
print(res.status_code)
# print(res.text)

bs = BeautifulSoup(res.text, 'lxml')  # parser: 'lxml', 'html.parser', ...

title = bs.select_one('#container > div.aside > div.group_aside > div.aside_area.aside_popular > h3')
print(title.get_text())  # get_text(): extract the text of the HTML element

# company_name = bs.select_one('#container > div.aside > div.group_aside > div.aside_area.aside_popular > table > tbody > tr.down > th > a')
# print(company_name.get_text())

# company_name = bs.select_one('#container > div.aside > div.group_aside > div.aside_area.aside_popular > table > tbody > tr:nth-of-type(2) > th > a')  # quiz: fetch HMM
# print(company_name.get_text())

# find the parent element
tbody = bs.select_one('#container > div.aside > div.group_aside > div.aside_area.aside_popular > table > tbody')
# find the child elements
trs = tbody.select('tr')
# print(trs)
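# The note above recommends checking robots.txt before crawling. A minimal
# sketch of doing that with the standard library; the '*' user agent and the
# finance.naver.com URL are just illustrative.
from urllib import robotparser

rp = robotparser.RobotFileParser()
rp.set_url('https://finance.naver.com/robots.txt')
rp.read()
print(rp.can_fetch('*', 'https://finance.naver.com/'))  # True if crawling this URL is allowed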
def parse_inner2(url):
    session = requests.Session()
    session.max_redirects = 9999999
    dct = {}
    # url = url.strip('"')
    page = session.get(url, headers=headers, verify=False)
    soup = BeautifulSoup(page.content, "html.parser")

    Event = soup.select_one("div#calevt_titre").contents[-1].strip()
    Race = soup.select_one(
        "div#race-container h2").contents[-1].strip()  # get_text(strip=True)
    WEBSITE = soup.select_one("div#calevt_titre > div > a.web", href=True)
    img_url = soup.select_one("div#im-container a", href=True)

    All_info = soup.select("div#calevt_fich tr")  # get_text(strip=True)
    lst_1 = []
    dct_2 = {}
    registr_url = ""
    for info in All_info:
        tds = info.select("td")
        if len(tds) == 2:
            dct_2[tds[0].get_text().strip()] = tds[1].get_text().strip()
        elif len(tds) == 1:
            if tds[0].get_text().strip() != "":
                if tds[0].select_one("a", href=True) is not None:
                    registr_url = tds[0].select_one("a", href=True)["href"]
                else:
                    lst_1.append(tds[0].get_text().strip())

    try:
        registr_fee = lst_1[lst_1.index("Registration fees") + 1]
    except ValueError:
        registr_fee = ""
    try:
        registr_open = dct_2["Opening of registration"]
    except:
        registr_open = ""
    try:
        registr_close = dct_2["Closure of registration"]
    except:
        registr_close = ""

    date_time = dct_2["Date and time of start"].split()
    starting_time = date_time.pop().strip()
    location_start = dct_2["Location of start"].split()
    date = " ".join(date_time)
    try:
        starting_point = location_start[0]
    except:
        starting_point = ""
    try:
        country = re.search(r'\((.*?)\)', dct_2["Location of start"].strip()).group(1)
    except:
        country = ""
    try:
        distance = re.sub('[(){}<>]', '', dct_2["Distance"]).split()[0].strip()
    except:
        distance = ""
    try:
        elevation_gain = dct_2["Ascent"].strip()
    except:
        elevation_gain = ""
    try:
        descent = dct_2["Descent"].strip()
    except:
        descent = ""
    try:
        refreshment_points = dct_2["Refreshment points"].strip()
    except:
        refreshment_points = ""
    try:
        time_limit = dct_2["Maximum time"].strip()
    except:
        time_limit = ""

    source_url = url
    try:
        course_url = soup.select_one("div#calevt_fich iframe", src=True)["src"]
    except:
        course_url = ""
    try:
        participants = dct_2["Number of participants"].strip()
    except:
        participants = ""

    description = soup.select("div.content p")
    Description = ""
    if len(description) != 0:
        for c, desc in enumerate(description):
            if c == 1:
                Description += "\nDescription in English\n"
            Description += desc.get_text()
    else:
        Description = ""

    if img_url is not None:
        img_url = domain + img_url["href"]
    else:
        img_url = ""
    if WEBSITE is not None:
        WEBSITE = WEBSITE["href"]
    else:
        WEBSITE = ""
    logo = soup.select_one("div#calevt_lst img", src=True)
    if logo is not None:
        logo_url = logo["src"]
    else:
        logo_url = ""

    th = soup.select_one("div#calevt_fich tr th", onclick=True)
    if th is not None:
        stage = th.select_one("a.rightarr", onclick=True)
        params = eval(stage['onclick'].split(";")[0])
        global sum_distance, sum_elevation_gain, sum_descent, sum_refreshment_points, sum_time_limit
        sum_distance = sum_elevation_gain = sum_descent = sum_refreshment_points = sum_time_limit = 0
        try:
            sum_distance = reprDist(distance)[0]
        except Exception as ex:
            pass
        try:
            sum_elevation_gain = reprDist(elevation_gain)[0]
        except Exception as ex:
            pass
        try:
            sum_descent = reprDist(descent)[0]
        except Exception as ex:
            pass
        try:
            sum_refreshment_points = reprDist(refreshment_points)[0]
        except Exception as ex:
            pass
        try:
            sum_time_limit = get_sec(time_limit)
        except Exception as ex:
            sum_time_limit = 0
            pass
        calcSUMS(params)
        sum_time_limit = get_str(sum_time_limit)
    else:
        sum_distance = distance
        sum_elevation_gain = elevation_gain
        sum_descent = descent
        sum_refreshment_points = refreshment_points
        sum_time_limit = time_limit

    dct["Event"] = Event
    dct["Race"] = Race
    dct["Description"] = Description
    dct["Participants"] = participants
    dct["Registration Opens"] = registr_open
    dct["Registration Closes"] = registr_close
    dct["Entry Fee"] = registr_fee
    dct["Sign Up"] = registr_url
    dct["Date"] = date
    dct["Starting Time"] = starting_time
    dct["Starting Point"] = starting_point
    dct["Country"] = country
    dct["SumDistance"] = sum_distance
    dct["SumElevation Gain"] = sum_elevation_gain
    dct["SumDescent"] = sum_descent
    dct["SumRefreshment Points"] = sum_refreshment_points
    dct["SumTimeLimit"] = sum_time_limit
    # dct["Website"] = WEBSITE
    dct["CourseUrl"] = course_url
    # dct["CourseFileName"] = ""
    # dct["LogoPicURL"] = logo_url
    dct["ProfilePicURL"] = img_url
    # dct["ProfilePicFile Name"] = ""
    # dct["SourceUrl"] = source_url
    return dct
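# get_sec and get_str are called in parse_inner2 but not defined in this
# excerpt. A hypothetical sketch, assuming the "Maximum time" field looks
# like "HH:MM:SS"; the real helpers may differ.
def get_sec(time_str):
    hours, minutes, seconds = (int(p) for p in time_str.strip().split(':'))
    return hours * 3600 + minutes * 60 + seconds


def get_str(total_seconds):
    hours, remainder = divmod(int(total_seconds), 3600)
    minutes, seconds = divmod(remainder, 60)
    return f'{hours:02d}:{minutes:02d}:{seconds:02d}'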
"id":ID, "pass":PASS, "mode":"login_entry_end" } res = session.post(url_login, data=login_info) res.raise_for_status() # for error print("--------------------------------------------------------") ############################################################### # Print User Name # ############################################################### soup_myage = BeautifulSoup(res.text, "html.parser") account_href = soup_myage.select_one(".spotlight li a").attrs["href"] url_account = urljoin(url_login, account_href) res_account = session.get(url_account) res_account.raise_for_status() soup_account = BeautifulSoup(res_account.text, "html.parser") user_name = str((soup_account.select(".section3 h3"))[0])[4:-5].split("/")[0] print("Hello "+ user_name + "!") print("--------------------------------------------------------") ############################################################### # Page Transition # ############################################################### a_list = soup_myage.select(".section.pickup a") favo_a = ""
def scrap(self, paper_views_range=150, paper_downloads_range=20,
          pages_min=1, pages_max=10, counter=1):
    papersList = []
    for disc in self.disciplineLinks:
        for page in range(pages_min, pages_max):
            print("page #" + str(page))
            html = urlopen(disc + str(page))
            print(disc + str(page))
            soup = BS(html, features="lxml")
            elems = [
                self.baseLink + x['href'] + '/pdf'
                for x in soup.findAll('a')
                if x['href'].find("article/n/") != -1
            ]
            for elem in elems:
                html2 = urlopen(elem[:-4])
                soup2 = BS(html2, features="lxml")
                if soup2.select_one(
                        '#body > div.content > div > span > div:nth-child(2) > h1 > i'
                ) is None:
                    print("Can't collect papers, captcha on CyberLeninka")
                    if papersList:
                        print("Create rawdata.json and stop")
                        with open('rawdata.json', 'w', encoding='utf-8') as f:
                            json.dump(papersList, f, ensure_ascii=False)
                        return True
                    else:
                        print("No papers were found, try again later")
                        return False
                paperTitle = soup2.select_one(
                    '#body > div.content > div > span > div:nth-child(2) > h1 > i'
                ).text
                paperViews = soup2.select_one(
                    '#body > div.content > div > span > div:nth-child(2) > div.infoblock.authors.visible > div.top-cc > div.statitem.views'
                ).text
                paperDownloads = soup2.select_one(
                    '#body > div.content > div > span > div:nth-child(2) > div.infoblock.authors.visible > div.top-cc > div.statitem.downloads'
                ).text
                print(paperTitle)
                journal = [
                    self.baseLink + y['href']
                    for y in soup2.findAll('a')
                    if y['href'].find("journal/n/") != -1
                ]
                html3 = urlopen(journal[0])
                soup3 = BS(html3, features="lxml")
                title = soup3.findAll('h1')[0].text
                statItems = [
                    x.text for x in soup3.findAll("div", {"class": "statitem"})
                ]
                if int(paperViews) > paper_views_range and int(
                        paperDownloads) > paper_downloads_range:
                    isGood = 1
                else:
                    isGood = 0
                print(isGood)
                paperObj = {
                    'journalName': title,
                    'journalViews': int(statItems[0]),
                    'journalDownloads': int(statItems[1]),
                    'journalHirch': int(statItems[2]),
                    'paperPath': self.baseFolder + self.baseName + str(counter) + ".pdf",
                    'paperUrl': elem[:-4],
                    'paperTitle': paperTitle,
                    'isGood': isGood
                }
                papersList.append(paperObj)
                fileInfo = urlretrieve(
                    elem, self.baseFolder + self.baseName + str(counter) + ".pdf")
                counter += 1
    with open('rawdata.json', 'w', encoding='utf-8') as f:
        json.dump(papersList, f, ensure_ascii=False)
    return True
        '.player-progress')
    progress__text_el = player_progress_el.find_element_by_css_selector(
        '.progress__bar.progress__text')

    while True:
        try:
            track_playing_el = driver.find_element_by_css_selector(
                '.d-track_playing')
            track = get_track(track_playing_el)
        except (NoSuchElementException, StaleElementReferenceException):
            continue

        el_html = progress__text_el.get_attribute('outerHTML')
        root = BeautifulSoup(el_html, 'html.parser')
        progress_left = to_seconds(root.select_one('.progress__left').text)
        progress_right = to_seconds(root.select_one('.progress__right').text)
        if progress_right == 0:
            continue

        progress_left_str = seconds_to_str(progress_left)
        progress_right_str = seconds_to_str(progress_right)
        print(
            f'{track.title}. {progress_left_str} / {progress_right_str} ({progress_left / progress_right:.1%})'
        )

        time.sleep(1)
finally:
    if driver:
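# to_seconds and seconds_to_str are used above but not defined in this
# excerpt. A hypothetical sketch, assuming the progress labels look like
# "M:SS" or "H:MM:SS"; the real helpers may differ.
def to_seconds(text):
    seconds = 0
    for part in text.strip().split(':'):
        seconds = seconds * 60 + int(part)
    return seconds


def seconds_to_str(total_seconds):
    minutes, seconds = divmod(int(total_seconds), 60)
    return f'{minutes:02d}:{seconds:02d}'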
def resume_page(url):
    #=== Check Key ===#
    resume_id = int(re.search('\/\d+\/', url).group(0).replace('/', ''))
    res = retry(url)
    soup = BeautifulSoup(res.content, 'lxml')
    try:
        #=== Check Not Found ===#
        info_sum = soup.select_one(
            "div#box_right div.span11.marL10.padB10.padL20")
        if info_sum == None:
            return 0

        #=== Extract Info Summary ===#
        prog = info_sum.select_one("div#progress_resume div")[
            'style'].strip()[-3:].replace('%', '')
        [div.decompose() for div in info_sum.select("div")]
        [b.decompose() for b in info_sum.select("b")]
        info_sum_list = re.sub(
            '[\s+]', '|', info_sum.text.strip().replace(" ", "")).split("|")
        info_sum_list = list(filter(lambda _: _ != "", info_sum_list))

        #=== Extract Info Resume ===#
        info_main = soup.select_one("div#resumeDT")
        resume_update = info_main.select_one(
            "div.taR.marR10").text.strip().split(":")
        resume_want_dict = {
            'job_type': '',
            'asked_job': [],
            'asked_location': '',
            'asked_salary': '',
            'start_in': '',
            'work_aboard': ''
        }
        resume_want_mapper = {
            'รูปแบบงาน:': 'job_type',
            'สาขาอาชีพ:': 'asked_job',
            'ตำแหน่ง:': 'asked_job',
            'พื้นที่ที่ต้องการทำงาน:': 'asked_location',
            'เงินเดือนที่ต้องการ:': 'asked_salary',
            'ระยะเวลาเริ่มงาน:': 'start_in',
            'ยินดีทำงานต่างประเทศ:': 'work_aboard'
        }
        resume_want = info_main.select_one(
            "div#resume_want").select("div.span11.offset1")
        if len(resume_want) > 0:
            for row in resume_want:
                # pair {occupation, position}
                op_dict = {}
                for col in row.select('.span6'):
                    key = re.sub('[\d.\s+]', '', col.b.text)
                    val = re.sub('[\s+]', '', col.span.text)
                    key = resume_want_mapper[key]
                    if key == 'asked_job':
                        if len(op_dict) == 0:
                            op_dict['occupation'] = val
                        else:
                            op_dict['position'] = val
                            resume_want_dict[key].append(op_dict)
                            op_dict = {}
                    else:
                        resume_want_dict[key] = val
    except Exception as e:
        print("Step info_resume", e)

    #=== Extract Education ===#
    try:
        resume_edu = []
        for dl in info_main.select("dl"):
            key = dl.select_one("dt").text.strip().replace(" :", "")
            val = dl.select_one("dd")
            for span in val.select("span"):
                span.decompose()
            val = re.sub('[\s+]', '|', val.text.strip().replace(" ", ""))
            val_list = list(filter(lambda _: _ != "", val.split("|")))
            resume_edu.append(
                (key, val_list[1:3] + val_list[4:])
            )
    except Exception as e:
        print("Step resume_edu", e)

    #=== Extract Experience ===#
    try:
        resume_exp = info_main.select("div.row-fluid.jsXp_row.padB5")
        if resume_exp != None:
            resume_exp_list = []
            for exp in resume_exp:
                work_info_list = [re.sub('[\s+]', ' ', _.text.strip())
                                  for _ in exp.select_one(
                                      "div.o.col000.span6.padV10H20.cor4.bg_lightyellow"
                                  ).select("span.padL10")]
                work_detail = exp.select_one("div.padB10.bb-code").text.strip()
                resume_exp_list.append((work_info_list, work_detail))
    except Exception as e:
        print("Step resume_exp", e)

    #=== Extract Skill[Nosql] ===#
    try:
        resume_skill_dict = {
            'own_vehicle': '',
            'skill_vehicle': '',
            'drive_license': '',
            'skill_lang': [],
            'skill_typing': {'th_wm': '', 'en_wm': ''},
            'skill_other': ''
        }
        resume_skill_mapper = {
            'ยานพาหนะ': 'own_vehicle',
            'ความสามารถในการขับขี่': 'skill_vehicle',
            'ใบอนุญาติขับขี่': 'drive_license',
            'ทักษะทางภาษา': 'skill_lang',
            'ทักษะการพิมพ์ดีด': 'skill_typing',
            'ทักษะอื่นๆ': 'skill_other'
        }
        resume_skill = info_main.select_one("div#resume_skill")
        lang_list = []
        if resume_skill != None:
            for skill_soup in resume_skill.select("div.padV10H20 > div.span11.offset1"):
                # skill_lang
                try:
                    skill_soup['style']
                except KeyError:
                    skill_soup['style'] = None
                if skill_soup['style'] == "float:left":
                    for row in skill_soup.select('div.span11.offset1'):
                        key = row.select_one(
                            '.span2.bg_lightyellow.taCen.o').text.strip()
                        val_list = [lang_mapper[re.sub(
                            '[\s+]', '', _.text).split(':')[1]]
                            for _ in row.select('.pull-left')]
                        resume_skill_dict['skill_lang'].append(
                            {
                                'name': key,
                                'skill': {
                                    "listen": val_list[0],
                                    "speak": val_list[1],
                                    "read": val_list[2],
                                    "write": val_list[3]
                                }
                            }
                        )
                else:
                    # pair {skill_key, skill_val}
                    try:
                        key, val = re.sub(
                            '[\s+]', '', skill_soup.text).split(':')
                    except:  # skill_other too many values
                        continue
                    key = resume_skill_mapper[key]
                    if key == 'skill_typing':
                        val_list = re.findall('\d+', val)
                        try:
                            resume_skill_dict[key]['th_wm'] = val_list[0]
                        except:
                            pass
                        try:
                            resume_skill_dict[key]['en_wm'] = val_list[1]
                        except:
                            pass
                    else:
                        resume_skill_dict[key] = val
    except Exception as e:
        print("Step resume_skill", e)

    #=== Extract Main Info ===#
    try:
        # Handle cast String into Integer
        try:
            age = int(info_sum_list[1])
        except:
            age = ''
        try:
            asked_salary = int(
                resume_want_dict['asked_salary'].replace(',', ''))
        except:
            asked_salary = ''
        resume_csv_dict = OrderedDict({
            # Primary key
            '_id': resume_id,
            'resume_modified': resume_update[1].strip(),
            'resume_progress': int(prog),
            'gender': info_sum_list[0].replace('ชาย', 'M').replace('หญิง', 'F'),
            'age': age,
            'exprience': int(info_sum_list[2]),
            'job_type': resume_want_dict['job_type'],
            'asked_job': resume_want_dict['asked_job'],
            'asked_location': resume_want_dict['asked_location'],
            'asked_salary': asked_salary,
            'start_in': resume_want_dict['start_in'],
            'work_aboard': resume_want_dict['work_aboard'],
            'edu_hist': [],
            'exp_hist': [],
            'own_vehicle': resume_skill_dict['own_vehicle'],
            'skill_vehicle': resume_skill_dict['skill_vehicle'],
            'drive_license': resume_skill_dict['drive_license'],
            'skill_lang': resume_skill_dict['skill_lang'],
            'skill_typing': resume_skill_dict['skill_typing'],
            'skill_other': resume_skill_dict['skill_other'],
            'training_hist': []
        })
    except Exception as e:
        print("Step resume_csv", e)

    #=== Extract exp ===#
    try:
        if len(resume_exp_list) > 0:
            for rel in resume_exp_list:
                result_dict = {}
                for exp, _ in zip(exp_mapper, rel[0]):
                    result_dict[exp] = _
                resume_csv_dict['exp_hist'].append(
                    {'exp_info': result_dict, 'exp_detail': rel[1]})
    except Exception as e:
        print("Step exp_csv", e)

    #=== Extract education ===#
    try:
        if len(resume_edu) > 0:
            for redu in resume_edu:
                result_dict = {'edu_year': redu[0].replace(
                    'กำลังศึกษาอยู่', 'Studying')}
                for edu, r in zip(edu_mapper, redu[1]):
                    if edu == "edu_level":
                        try:
                            r = edu_dict[r]
                        except KeyError:
                            # print("KeyError: ", r)
                            r = edu_dict[r.replace(
                                '(กำลังศึกษาอยู่)', '')]
                            r = r + ' (Studying)'
                    result_dict[edu] = r
                resume_csv_dict['edu_hist'].append(result_dict)
    except Exception as e:
        print("Step edu_csv", e)

    #=== Extract training ====#
    try:
        training_list = info_main.select_one(
            "#resume_skill + div").select("div.row-fluid")
        if training_list != None:
            for train in training_list:
                result_dict = {}
                tl = [re.sub('[\s+]', ' ', _.text.strip())
                      for _ in train.select("span")]
                for tm, td in zip(train_mapper, tl):
                    result_dict[tm] = td
                if len(result_dict) == 0:
                    break
                else:
                    resume_csv_dict['training_hist'].append(result_dict)
    except Exception as e:
        print("Step training_csv", e)

    return resume_csv_dict
import requests
from bs4 import BeautifulSoup

nickname = input('Enter nickname: ')
url = f'https://www.op.gg/summoner/userName={nickname}'
response = requests.get(url).text
data = BeautifulSoup(response, "html.parser")
# call select_one on the parsed soup ("data"), not on the BeautifulSoup class
tier = data.select_one("#SummonerLayoutContent > div.tabItem.Content.SummonerLayoutContent.summonerLayout-summary > div.SideContent > div.TierBox.Box > div > div.TierRankInfo > div.TierRank")
print(tier.text)
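# select_one returns None when the selector matches nothing (op.gg renders
# much of its content client-side, so the element may be missing from the raw
# HTML). A small guard avoids the AttributeError on .text:
if tier is not None:
    print(tier.text.strip())
else:
    print('Tier element not found in the fetched HTML.')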