def get_tiki_product_detail(self, data):
    logging.warning("Getting product data...")
    html_doc = data.get('raw_data')
    if html_doc:
        soup = BeautifulSoup(html_doc, 'html.parser')
        categories_container = soup.findChild("ul", {"class": "breadcrumb"})
        categories = []
        for category in categories_container.find_all("a"):
            categories.append(category.text)
        title = soup.findChild("h1", {"id": "product-name"})
        gross_price = soup.findChild("span", {"id": "span-list-price"})
        net_price = soup.findChild("span", {"id": "span-price"})
        try:
            item_info = {
                "url": data.get('url'),
                "title": title.text.replace('\n', ''),
                "gross_price": get_price(gross_price.text) if gross_price else get_price(net_price.text),
                "net_price": get_price(net_price.text),
                "categories": categories,
                "type": "tiki",
                'task_id': self.request.root_id
            }
            elk_logger.info(msg="Saved item " + data.get('url'), extra=item_info)
            return item_info
        except AttributeError as exc:
            print("Error while parsing data, crawl again...")
            celery_app.send_task(
                "crawl_url",
                queue='priority.high',
                kwargs={
                    'url': data.get('url'),
                    'required_class': 'item-name',
                    'label': 'tiki_crawling_product_detail',
                },
                countdown=30,
                link=get_tiki_product_detail.s(),
                expires=datetime.now() + timedelta(days=1))
    else:
        celery_app.send_task(
            "crawl_url",
            queue='priority.high',
            kwargs={
                'url': data.get('url'),
                'required_class': 'item-name',
                'label': 'tiki_crawling_product_detail',
            },
            countdown=30,
            link=get_tiki_product_detail.s(),
            expires=datetime.now() + timedelta(days=1))

def get_table_header(soup: BeautifulSoup) -> List[str]:
    """
    Return the header of a table.

    Parameters
    ----------
    soup: BeautifulSoup
        BeautifulSoup object of the table

    Returns
    -------
    list of str
        Table header
    """
    thead = soup.findChild('thead')
    if thead:
        thead_row = thead.findAll('tr')
        tags = 'th'
    else:
        thead = soup.findChild('tbody')
        thead_row = thead.findAll('tr')
        first_row = thead_row[0]
        max_row_span = 1
        for col in first_row.findAll('td'):
            row_span = int(col.attrs.get('rowspan', 1))
            if row_span > max_row_span:
                max_row_span = row_span
        thead_row = thead_row[0:max_row_span]
        tags = 'td'
    columns_name = []
    is_empty = []
    for idx, row in enumerate(thead_row):
        for col in row.findAll(tags):
            row_span = int(col.attrs.get('rowspan', 1))
            col_span = int(col.attrs.get('colspan', 1))
            if idx == 0:
                for _ in range(col_span):
                    columns_name.append(col.text)
                    if 1 < row_span:
                        is_empty.append(False)
                    else:
                        is_empty.append(True)
            else:
                start_index = 0
                for jdx, _ in enumerate(is_empty):
                    if is_empty[jdx] is True:
                        start_index = jdx
                        break
                for jdx in range(start_index, start_index + col_span):
                    columns_name[jdx] += col.text
                    is_empty[jdx] = False
    columns_name = [col.replace('\n', ' ') for col in columns_name]
    return columns_name

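# --- Usage sketch for get_table_header (added illustration, not from the original source) ---
# Assumes bs4 is installed and get_table_header from above is in scope; the sample HTML
# below is made up to exercise the rowspan/colspan merging logic.
from bs4 import BeautifulSoup

_sample_html = """
<table>
  <thead>
    <tr><th rowspan="2">Name</th><th colspan="2">Score</th></tr>
    <tr><th>Math</th><th>English</th></tr>
  </thead>
</table>
"""

_table = BeautifulSoup(_sample_html, "html.parser").find("table")
print(get_table_header(_table))  # expected: ['Name', 'ScoreMath', 'ScoreEnglish']
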
def parse(self, response):
    sel = Selector(response)
    profile = {'url': response.url, 'skills': [], 'experience': []}
    # Parse current page URL (public profile URL)
    # Read Skills section
    skills_list = sel.xpath('//a[@class="endorse-item-name-text"]').extract()
    for skill in skills_list:
        skill = self.remove_tag('a', skill)
        profile['skills'].append(skill)
    # List of experience items
    exp_items = []
    # Read Companies and Titles
    exp_entries = sel.xpath('//div[contains(@id, "experience-") and contains(@id, "-view")]').extract()
    for exp_entry in exp_entries:
        b_soup = BeautifulSoup(exp_entry)
        # Get company name
        exp_company_matches = b_soup.findChildren('a', href=re.compile(r'prof-exp-company-name'))
        exp_company = exp_company_matches[len(exp_company_matches) - 1].get_text() \
            if len(exp_company_matches) > 0 else None
        # Get title within company
        exp_title = b_soup.findChild('a', {'name': 'title'}).get_text()
        # Get work description
        exp_desc_match = b_soup.findChild('p', {'class': 'description'})
        exp_desc = exp_desc_match.get_text() if exp_desc_match is not None else None
        # Get work date-locale
        exp_date_loc = b_soup.findChild('span', {'class': 'experience-date-locale'})
        exp_duration_items = exp_date_loc.findChildren('time')
        exp_is_current = 'Present' in exp_duration_items[1].get_text()
        exp_duration = re.sub(r'[^a-zA-Z0-9 ]', '', exp_duration_items[2].get_text()).strip()
        exp_location_item = exp_date_loc.findChild('span', {'class': 'locality'})
        exp_location = None
        if exp_location_item is not None:
            exp_location = re.sub(r'^[^"]*"', '', exp_location_item.get_text())
            exp_location = exp_location.replace("\"", "").strip()
        exp_items.append(ExperienceItem(exp_is_current, exp_title, exp_company,
                                        exp_location, exp_duration, exp_desc))
    profile['experience'] = exp_items
    # Sleep to appease LinkedIn rate limiting
    time.sleep(5)
    self.profile_map[response.url] = profile
    return LinkedInItem(profile)

def get_shopee_product_detail(self, data):
    logging.warning("Getting product data...")
    html_doc = data.get('raw_data')
    if html_doc:
        soup = BeautifulSoup(html_doc, 'html.parser')
        categories_html = soup.findAll("a", {"class": "JFOy4z _20XOUy"})
        categories = []
        for category_html in categories_html:
            categories.append(category_html.text)
        title = soup.findChild("span", {"class": "OSgLcw"})
        gross_price = soup.findChild("div", {"class": "_3_ISdg"})
        net_price = soup.findChild("div", {"class": "_3n5NQx"})
        try:
            item_info = {
                "url": data.get('url'),
                "title": title.text.replace('\n', ''),
                "gross_price": get_price(gross_price.text) if gross_price else get_price(net_price.text),
                "net_price": get_price(net_price.text),
                "categories": categories,
                "type": "shopee",
                'task_id': self.request.root_id
            }
            elk_logger.info(msg="Saved item " + data.get('url'), extra=item_info)
            return item_info
        except AttributeError as exc:
            print("Error while parsing data, crawl again...")
            celery_app.send_task(
                "crawl_url",
                queue='priority.high',
                kwargs={
                    'url': data.get('url'),
                    'required_class': '_3n5NQx',
                    'label': 'crawling_product_detail',
                },
                countdown=30,
                link=get_shopee_product_detail.s(),
                expires=datetime.now() + timedelta(days=1)
            )
    else:
        celery_app.send_task(
            "crawl_url",
            queue='priority.high',
            kwargs={
                'url': data.get('url'),
                'required_class': '_3n5NQx',
                'label': 'crawling_product_detail',
            },
            countdown=30,
            link=get_shopee_product_detail.s(),
            expires=datetime.now() + timedelta(days=1)
        )

def get_title_and_problem_list(self):
    url = 'http://acm.hdu.edu.cn/contests/contest_show.php?cid=%d' % \
        self.contest_id
    r = self.session.get(url)
    soup = BeautifulSoup(r.text, 'html.parser')
    title = soup.findChild('h1').text
    if '- Team' in title:
        # For unification, replace "- Team X" with "X"
        word = title.split(' ')
        title = ' '.join(word[:-3]) + ' ' + word[-1]
    print(title)
    table = soup.findChild('table')
    name_list = []
    for row in table.findChildren('tr'):
        name_list.append(row.findChildren('td')[-2].text)
    return title, name_list[1:]

def getpdfByID(self, rgid):
    r = requests.get("https://www.researchgate.net/publication/" + str(rgid),
                     headers=browserhdr, timeout=timeout_setting)
    if r.status_code == 200:
        soup = BeautifulSoup(r.text, "html.parser")
        out = soup.findChild(name="a", attrs={"class": "blue-link js-download rf btn btn-promote"})
        link = ''
        if out:
            link = out['href']
        out = soup.findChild(attrs={'name': "citation_doi"})
        doi = ""
        if out:
            doi = out['content']
        filename = quotefileDOI(doi.lower().strip())
        return self.getpdfByLink(link, filename)
    return False

def get_dic_url():
    """Get the pinyin slugs of business districts under the Shanghai listing
    that have fewer than 100 result pages (used to build request URLs)."""
    req_url = raw_url.format(busi_area="")
    req = CheatRequests([[req_url]])
    content = req.get_cheat_first_content[0].decode("utf-8")
    bs = BeautifulSoup(content, "lxml")
    dic_list = (bs.findChild("div", {"class": "filter-box"})
                  .findChild("div", {"id": "filter-options"})
                  .findChild("dl", {"class": "dl-lst clear"})
                  .findChild("dd")
                  .findChild("div", {"class": "option-list"})
                  .findChildren("a"))
    dic_list = [
        re.findall("href=\"/zufang/(.+)/\"", str(dic))[0]
        for dic in dic_list[1:]
    ]
    # Make sure each district contains fewer than 100 result pages
    busi_list_result = []
    dic_list_result = dic_list[:]
    for dic in dic_list:
        pages = get_pages(dic)
        if pages > 100:
            # Drop districts with more than 100 pages and use their sub-districts instead
            dic_list_result.remove(dic)
            req_url = raw_url.format(busi_area=dic)
            req = CheatRequests([[req_url]])
            content = req.get_cheat_first_content[0].decode("utf-8")
            bs = BeautifulSoup(content, "lxml")
            busi_list = (bs.findChild("div", {"class": "filter-box"})
                           .findChild("div", {"id": "filter-options"})
                           .findChild("dl", {"class": "dl-lst clear"})
                           .findChild("dd")
                           .findChild("div", {"class": "option-list sub-option-list"})
                           .findChildren("a"))
            busi_list = [
                re.findall("href=\"/zufang/(.+)/\"", str(busi))[0]
                for busi in busi_list[1:]
            ]
            busi_list_result += busi_list
    dic_list_result += busi_list_result
    return dic_list_result

def get_highlight(search, index=0, list_index=False):
    response = json.loads(get(highlights_url.format(search=search)).text)
    urls = []
    title_index = {}
    idx = 1
    for doc in response['docs']:
        title = doc['title']
        urls.append((title, doc['url']))
        title_index[idx] = title
        idx = idx + 1
    if list_index:
        if title_index:
            body = ""
            for index, title in title_index.items():
                body += f"{index} - {title}" + "\n"
            return discord.Embed(title="Highlight Index", description=body)
        else:
            raise NoResultsError(f'No results for {search}')
    else:
        try:
            title = '**' + urls[index][0] + '**'
            video_url = urls[index][1]
        except IndexError:
            raise NoResultsError(f'No results for {search}')
        soup = BeautifulSoup(get(video_url).text, 'html.parser')
        video = soup.findChild(
            lambda tag: tag.name == 'meta'
            and tag.get('itemprop') == 'contentURL'
            and tag.get('content').endswith('.mp4')
        ).get('content')
        if video:
            return title, video
        else:
            raise NoResultsError(f'Error parsing video url for {search}')

def retrieveData(api_url):
    try:
        response = requests.get(api_url)
    except requests.exceptions.ConnectionError as e:
        print('Error', e.args)
        exit(1)
    # the whole html of the website
    html = response.content
    # print(html)
    # initialize bsoup html parser
    soup = BeautifulSoup(html, 'html.parser')
    containerhtml = soup.findChild('div', class_='mainbody').findChild('div', class_='container')
    print(containerhtml)
    signupValidStr = containerhtml.find('h1').get_text()
    # print(signupValidStr)
    slotAvailability = containerhtml.find('h1').get_text()
    # print(slotAvailability)
    # if the signupgenius website no longer exists
    if "The Sign Up Was Not Found" in signupValidStr:
        return False
    # if the website has no available spots for covid vaccine sign up
    if "NO SLOTS AVAILABLE. SIGN UP IS FULL." in slotAvailability:
        return False
    return True

def download_icon(self):
    response = requests.get(self.url_for())
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")
    icon_url = soup.findChild("div", attrs={"class": "avatar-wrapper"}).findChild("img").get("src")
    return self._client.downloader.download(icon_url)

def scrap_lemonde(recherche):
    url = "https://www.lemonde.fr/recherche/?search_keywords={}".format(recherche)
    lemonde = requests.get(url)
    html_lemonde = BeautifulSoup(lemonde.text, 'html.parser')
    contenu_recherche = html_lemonde.findChild("section", attrs={"class": "js-river-search"})
    # print(dir(contenu_recherche))
    list_titles = list()
    list_des = list()
    for section in contenu_recherche.children:
        if section.name == "section":
            if section.a:
                print("----------------------")
                title = section.a.h3.get_text()
                description = section.a.p.get_text()
                list_titles.append(title)
                list_des.append(description)
                # print(title)
                # print("***")
                # print(description)
    streamlit.sidebar.header("Scrapy le monde")
    streamlit.sidebar.info("Exos pour extraction d'information concernant le coronavirus")
    streamlit.error("errorrrrr")
    streamlit.title("Exo scrapy le monde coronavirus title et description")

def open_mobility_file():
    TEMPORAL_FILE = '/tmp/movement-data.csv'
    MOVEMENT_BASE_URL = 'https://data.humdata.org'
    MOVEMENT_URL = MOVEMENT_BASE_URL + '/dataset/movement-range-maps'
    if not os.path.exists(TEMPORAL_FILE):
        cdata = requests.get(MOVEMENT_URL)
        cdata = BeautifulSoup(cdata.text)
        links = cdata.findChild('div', {'id': 'data-resources-0'}).find_all('a')
        data_link = next(
            _ for _ in links
            if 'href' in _.attrs and _.attrs['href'].endswith('zip'))
        data_link = data_link.attrs['href']
        data_link = MOVEMENT_BASE_URL + data_link if data_link.startswith('/') else data_link
        data_container = requests.get(data_link, stream=True)
        data_container = ZipFile(io.BytesIO(data_container.content))
        data_file = next(_ for _ in data_container.filelist
                         if _.filename.startswith('movement-range'))
        data_file = data_container.open(data_file.filename)
        with open(TEMPORAL_FILE, 'wb') as disk_file:
            shutil.copyfileobj(data_file, disk_file)
        data_file.close()
    return open(TEMPORAL_FILE)

def find_data_login(session, url, user, password):
    form = {'action': '', 'data': {}}
    data = form.get('data')
    req = session.get(url)
    soup = BeautifulSoup(req.content, 'html.parser')
    if soup.findChild('input', {'type': 'password'}):
        form_action = soup.find('form')
        action = form_action.get('action', '')
        method = form_action.get('method', '')
        form['action'] = action
        form['method'] = method
        for input in soup.find_all('input'):
            if input.has_attr('name'):
                if input.get('type').lower() == 'submit':
                    if input.has_attr('name'):
                        data[str(input.get('name'))] = str(input.get('value'))
                    else:
                        data[str(input.get('name'))] = 'submit'
                elif input.get('type').lower() == 'password':
                    data[str(input.get('name'))] = password
                elif input.get('type').lower() == 'text' and input.get('value') is None:
                    data[str(input.get('name'))] = user
                else:
                    data[str(input.get('name'))] = str(input.get('value'))
    return form

def _parse_request_to_movie(request):
    soup = BeautifulSoup(request.text, "html.parser")
    title = soup.findChild("h1", {"class": "filmCoverSection__title"}).text
    description_pl = soup.find("div", {"class": "filmPosterSection__plot"}).text
    premiere_year = soup.find("span", {"class": "filmCoverSection__year"}).text
    movie_time = soup.find("span", {"class": "filmCoverSection__filmTime"}).text
    movie_rating_value = soup.find("span", {"class": "filmRating__rateValue"}).text
    movie_rating_count = soup.find("span", {"class": "filmRating__count"}).text
    movie_details = {
        "title": title,
        "description_pl": description_pl,
        "premiere_year": premiere_year,
        "movie_time": movie_time,
        "movie_rating_value": movie_rating_value,
        "movie_rating_count": movie_rating_count
    }
    movie = Movie.parse(movie_details)
    return movie

def get_current_AMI_info():
    url = "https://aws.amazon.com/amazon-linux-ami/"
    page = urllib2.urlopen(url).read()
    soup = BeautifulSoup(page)
    tables = soup.findChild('table')
    rows = tables.findChildren(['th', 'tr'])
    return rows

def searchForClothesStockX(query):
    logger = logging.getLogger('mylogger')
    url = "https://stockx.com/search?s=" + query
    driver = webdriver.Firefox(
        executable_path='/mnt/c/GeckoDriver/geckodriver.exe')
    results = []
    try:
        driver.get(url)
        WebDriverWait(driver, 60).until(
            EC.presence_of_element_located(
                (By.XPATH, '//div[@class="tile browse-tile"]')))
        scrollDownWindow(driver)
        # start extracting listings
        soup = BeautifulSoup(driver.page_source)
        soup = soup.find("div", {"class": "browse-grid"})
        listings = soup.findChild().contents
        for listing in listings:
            listingToAdd = extractInfo(listing)
            if listingToAdd:
                results.append(listingToAdd)
        return results
    except TimeoutException:
        raise ParseError

def get_blurb(first, last, sport, player_url=None):
    # for some weird reason it's actually better to omit the first name in the search form
    response = get(player_url if player_url else blurb_search_url.format(first="", last=last, sport=sport))
    soup = BeautifulSoup(response.text, 'html.parser')
    # did we land on a result page?
    if not soup.findChild('div', class_='RW_pn'):
        name_map = {}
        results_table = soup.find('table', attrs={'id': 'cp1_tblSearchResults'})
        # filter results, omitting duplicate "position" links that don't include the player's name
        filtered_results = results_table.findChildren(
            lambda tag: tag.name == 'a' and 'player' in tag['href'] and len(tag.text) > 3)
        if not filtered_results:
            raise NoResultsError("No results for %s %s" % (first, last))
        else:
            for result in filtered_results:
                name = " ".join(result.text.split())
                name_map[result] = SequenceMatcher(None, first + " " + last, name).ratio()
            # sort names by similarity to search criteria
            sorted_names = sorted(name_map, key=name_map.get, reverse=True)
            return get_blurb(first, last, sport,
                             player_url='http://www.rotoworld.com' + sorted_names[0].get('href'))
    else:
        news = soup.findChildren('div', class_='playernews')
        if news:
            recent_news = news[0]
            report = recent_news.find('div', class_='report')
            impact = recent_news.find('div', class_='impact')
            blurb = report.text + '\n\n' + impact.text
            return blurb
        else:
            raise NoResultsError("No recent player news for %s %s" % (first, last))

def get_tiki_products_url(self, data):
    # Return the list of product urls from the crawled result page
    html_doc = data.get('raw_data')
    soup = BeautifulSoup(html_doc, 'html.parser')
    items_container = soup.findChild("div", {"class": "product-box-list"})
    time.sleep(5)
    item_urls = items_container.find_all('a')[:10]
    urls = []
    for item_url in item_urls:
        if re.match(r"^https?:\/\/(w{3}\.)?tiki.vn\/.*?$", item_url.get('href')):
            urls.append(item_url.get('href'))
    logging.warning("end loop...")
    from celery import chord
    chord(
        craw_tiki_url.subtask(
            queue='priority.high',
            kwargs={
                'url': url,
                'required_class': 'item-name',
                'label': 'tiki_crawling_product_detail',
            },
            countdown=30,
            link=get_tiki_product_detail.s(),
            expires=datetime.now() + timedelta(days=1))
        for url in urls)(on_finish_crawl_tiki.s())
    response = {'search_url': data.get('url'), 'prods_urls': urls}
    elk_logger.info(response)
    return response

def get_postion(position_id, postion_dir):
    # Build the request
    url = "https://www.lagou.com/jobs/%s.html" % (position_id)
    headers = {
        'Cookie': 'TG-TRACK-CODE=index_search; SEARCH_ID=e8222f3471a44abf85093f79d876f759; JSESSIONID=ABAAABAABEEAAJA080B57268659EBB1C73E65E8835B1D1D; WEBTJ-ID=20181111160721-16701cf96c8949-0fc6f60feec025-48183706-1024000-16701cf96c94d2; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1541944837; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1541923641,1541944111; LGRID=20181111215853-ec514a2a-e5b9-11e8-9aa0-525400f775ce; LGSID=20181111214647-3bdcc9f7-e5b8-11e8-9a9f-525400f775ce; _ga=GA1.2.1318630155.1541923641; _gat=1; _gid=GA1.2.1216768844.1541923642; PRE_HOST=www.baidu.com; PRE_LAND=https%3A%2F%2Fwww.lagou.com%2Flp%2Fhtml%2Fcommon.html%3Futm_source%3Dm_cf_cpt_baidu_pc; PRE_SITE=https%3A%2F%2Fwww.baidu.com%2Fs%3Fwd%3D%25E6%258B%2589%25E5%258B%25BE%25E7%25BD%2591%26rsv_spt%3D1%26rsv_iqid%3D0xf57dfecd000124aa%26issp%3D1%26f%3D8%26rsv_bp%3D1%26rsv_idx%3D2%26ie%3Dutf-8%26rqlang%3Dcn%26tn%3Dbaiduhome_pg%26rsv_enter%3D0%26oq%3Dpython%252520%2525E6%252595%2525B0%2525E6%25258D%2525AE%2525E6%25258C%252596%2525E6%25258E%252598%26rsv_t%3De111J%252FhqMM1XxboP3SPnY%252Fw6ah3WItaAjhCUX2DgoGHa814Syn2DSmf%252F0Kh31gQZTnH%252B%26inputT%3D6259%26rsv_pq%3D918956f400061b60%26rsv_sug3%3D280%26rsv_sug1%3D254%26rsv_sug7%3D100%26bs%3Dpython%2520%25E6%2595%25B0%25E6%258D%25AE%25E6%258C%2596%25E6%258E%2598; PRE_UTM=m_cf_cpt_baidu_pc; index_location_city=%E5%85%A8%E5%9B%BD; LGUID=20181111160538-9328c957-e588-11e8-9a22-525400f775ce; user_trace_token=20181111160538-9328c039-e588-11e8-9a22-525400f775ce',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Host': 'www.lagou.com',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0.1 Safari/605.1.15',
        'Accept-Language': 'zh-cn',
        'Referer': 'https://www.lagou.com/',
        'Connection': 'keep-alive'
    }
    session = requests.Session()
    session.headers = headers
    response = session.get(url)
    doc = response.content.decode()
    soup = BeautifulSoup(doc, 'html.parser')
    # soup = BeautifulSoup(response.content, 'html.parser', from_encoding='utf-8')
    try:
        job_bt = soup.findChild(name='dd', class_='job_bt')  # renamed from `re` to avoid shadowing the re module
        rr = job_bt.find("div").find_all("p")
        if not os.path.exists(postion_dir + "/"):
            # print("directory does not exist")
            os.mkdir(postion_dir + "/")
        with open(postion_dir + "/%s.txt" % (position_id), 'w') as fd:
            for p in rr:
                # print(p)
                fd.write(p.text + "\n")
    except:
        pass

def test_only_the_custom_region_is_created(self):
    caption_set = DFXPReader().read(
        SAMPLE_DFXP_TO_RENDER_WITH_ONLY_DEFAULT_POSITIONING_INPUT)
    new_region = Layout(
        alignment=Alignment(
            HorizontalAlignmentEnum.LEFT, VerticalAlignmentEnum.TOP
        )
    )
    dfxp = SinglePositioningDFXPWriter(new_region).write(caption_set)
    # Using a different parser, because this preserves letter case
    # The output file is ok, but when parsing it, the "regular" parser
    # loses letter case.
    layout = BeautifulSoup(dfxp, features='xml').findChild('layout')
    self.assertEqual(len(layout.findChildren('region')), 1)
    region = layout.findChild('region')
    text_align = region['tts:textAlign']
    display_align = region['tts:displayAlign']
    internal_alignment = _create_internal_alignment(text_align, display_align)  # noqa
    self.assertEqual(internal_alignment.horizontal, HorizontalAlignmentEnum.LEFT)  # noqa
    self.assertEqual(internal_alignment.vertical, VerticalAlignmentEnum.TOP)  # noqa

def retrieveData(api_url):
    try:
        response = requests.get(api_url)
        # print("--- response headers: ", response.headers)
    except requests.exceptions.ConnectionError as e:
        print('Error', e.args)
        exit(1)
    html = response.content
    # print("--- html: ", html)
    # parsing html with BS
    soup = BeautifulSoup(html, 'html.parser')
    elements = soup.findChild(
        "ol", id="accordion-rankings-184224").findChildren("li")
    datalist = []
    for i in range(0, len(elements)):
        row = []
        univ_name = elements[i].find('span').getText()
        # print(univ_name)
        row.append(univ_name)
        location = elements[i].find('p').getText()
        row.append(location)
        row.append(i + 1)  # append ranking, which is the iteration
        desc = elements[i].findChild(
            "div", class_="inner-content").find('p').getText()
        row.append(desc)
        url = elements[i].findChild('a')["href"]
        row.append(url)  # url of university
        row.append(api_url)  # url of the site
        # print(row)
        datalist.append(row)
    return datalist

def __init__(
    self,
    video_game: str,
    event: str = "",
    player1: str = "",
    player2: str = "",
    character1: str = "",
    character2: str = "",
    caster1: str = "",
    caster2: str = "",
    num_workers: int = 10,
    num_page_workers: int = 2,
    verbose: bool = False,
):
    self.base_url = self.URL(video_game, event, player1, player2,
                             character1, character2, caster1, caster2)
    self.num_workers = num_workers
    self.num_page_workers = min(num_page_workers, self.num_workers)
    self.session = FuturesSession(max_workers=self.num_workers)
    page_content = self.request(str(self.base_url)).result().content
    page_soup = BeautifulSoup(page_content, "lxml")
    self.num_pages = 1
    last_page_tag = page_soup.findChild("a", title="Go to last page")
    if last_page_tag:
        self.num_pages = int(
            re.search(r"page=([\d]+)", last_page_tag["href"]).group(1))
    self.verbose = verbose

def scrape_vod_page(
        self, vod_id: str,
        vod_request: Future) -> Tuple[List[str], List[Vod.Caster]]:
    vod_content = vod_request.result().content
    vod_strainer = SoupStrainer("div", class_="region-inner clearfix")
    vod_soup = BeautifulSoup(vod_content, "lxml", parse_only=vod_strainer)
    content = vod_soup.findChild(recursive=False)
    try:
        video_ids = [
            re.search(r"^([^?]*)", v["data-vod"]).group(1)
            for v in content.findChildren(
                "div", class_="js-video widescreen", recursive=False)
        ]
        if len(video_ids) == 0:
            raise InvalidVideoError(vod_id)
        casters = []
        casters_tag = content.findChild("div", class_="field-items")
        if casters_tag:
            casters = [
                Vod.Caster(c.getText())
                for c in casters_tag.findChildren(recursive=False)
            ]
        return (video_ids, casters)
    except KeyError:
        raise InvalidVideoError(vod_id)

def main():
    # BOROUGH OF JAMESBURG
    def ToGetItIn_manner(datasets):
        for data in datasets:
            p = str(data)
            p = p.splitlines()
            p = filter(None, p)
            st = str(p)
            st1 = st.strip('[')
            st2 = st1.strip(']')
            st3 = st2.strip("'")
            st4 = str(st3.replace("'", ""))
            print " ".join(st4.split())

    datasets = []
    url = raw_input("enter the url")
    # url = "http://www.southrivernj.org/elected_officials.html"
    opener = urllib2.build_opener()
    print "Built opener"
    opener.addheaders = [('user_agent', 'Chrome/41.0.2243.0')]
    print "added headers"
    response = opener.open(url)
    print "Took Response"
    soup = BeautifulSoup(response)
    BoroughOfJamesburg = soup.findChild("div", {'id': 'content'})
    for row in BoroughOfJamesburg.find_all(['p']):
        if (True):
            # print row.get_text().encode('utf-8')
            dataset = row.get_text().encode('utf-8')
            datasets.append(dataset)
    for row in BoroughOfJamesburg.find_all('div'):
        dataset = row.get_text().encode('utf-8')
        datasets.append(dataset)
    # print datasets
    ToGetItIn_manner(datasets)

def load_annotation(xmlFile):
    """
    Read the annotations for an image from an xml file and return a
    dictionary of the objects and their locations
    """
    with open(xmlFile) as f:
        xml = f.readlines()
    xml = ''.join([line.strip('\t') for line in xml])
    anno = BeautifulSoup(xml, "html5lib")
    anno_dic = {}
    fname = anno.findChild('filename').contents[0]
    anno_dic['filename'] = fname
    objs = anno.findAll('object')
    # print('Number of objects:', len(objs))
    objects = []
    for obj in objs:
        obj_name = obj.findChild('name').contents[0]
        bbox = obj.findChildren('bndbox')[0]
        xmin = int(bbox.findChildren('xmin')[0].contents[0])
        ymin = int(bbox.findChildren('ymin')[0].contents[0])
        xmax = int(bbox.findChildren('xmax')[0].contents[0])
        ymax = int(bbox.findChildren('ymax')[0].contents[0])
        obj_dic = {'object_name': obj_name,
                   'location': np.array([xmin, ymin, xmax, ymax])}
        objects.append(obj_dic)
    anno_dic['annotation'] = objects
    return anno_dic

def get_page_links(self, date: str) -> List[str]:
    """
    Page Links method
    =================
    function to get the page links for a date

    Arguments
    ---------
    date: str
        date such as "%Y-%m-%d" ("2021-05-03")

    Returns
    -------
    page_links: list of str
        ex. ['url1', 'url2', ...]
    """
    req = requests.get(self.main_url + date)  # HTTP GET Request
    soup = BeautifulSoup(req.text, "html.parser")
    page_links = soup.findChild("table", {"class": "Nnavi"})
    page_links = page_links.find_all("a")
    page_links = [link["href"] for link in page_links]
    page_links = list(set(page_links))
    page_links.sort()
    return page_links

class BingPage:
    def __init__(self, page):
        self.soup = BeautifulSoup(page, 'html.parser')

    def next_url(self):
        url = self.soup.findChild(class_='sb_pagN')
        return url.get('href') if url else None

    def get_results(self):
        results = self.soup.find_all(class_='b_algo')
        infos = []
        for result in results:
            infos.append(self._parse_result(result))
        return infos

    def _parse_result(self, result):
        info = {'title': '', 'abstract': '', 'link': ''}
        info['title'] = self._text(result.h2)
        caption = result.findChild(class_='b_caption')
        if caption:
            info['abstract'] = self._text(caption.findChild('p'))
            info['link'] = self._text(caption.cite)
        return info

    def _text(self, node):
        return node.getText() if node else ''

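# --- Usage sketch for BingPage (added illustration, not from the original source) ---
# Assumes a Bing results page has been saved to disk; the filename is hypothetical.
if __name__ == '__main__':
    with open('bing_results.html', encoding='utf-8') as fh:
        page = BingPage(fh.read())
    for info in page.get_results():
        print(info['title'], '->', info['link'])
    print('next page:', page.next_url())
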
def get_seats(ticket_href):
    seats_total = 0
    seats_available = 0
    try:
        req = requests.post(PATHE_URL + ticket_href)  # Get special URL redirection
        req = requests.get(req.url + "/stoelen")
    except Exception:
        return seats_available, seats_total  # Some error occurred during web request.
    if ticket_href in req.url:
        return seats_available, seats_total  # Most likely you can't buy tickets anymore because the movie started.
    if req.status_code == 404:
        return seats_available, seats_total  # Most likely you can't buy tickets anymore because the movie started.
    if req.status_code == 500:
        return seats_available, seats_total  # Most likely there are no seats for this movie. (Known possibility: Drive-In Cinema)
    soup = BeautifulSoup(req.text, "html.parser")
    try:
        seats = soup.findChild("ul", {"id": "seats"}).findChildren("li")
    except AttributeError:
        # Unknown error..
        print("\nError at:")
        print(req.status_code)
        print(req.url)
        print("")
        return seats_available, seats_total
    for seat in seats:
        seats_total += 1
        try:
            seat["class"]
        except KeyError:
            seats_available += 1
    return seats_available, seats_total

def stream(self, records):
    for record in records:
        soup = BeautifulSoup(record[self.textfield], self.parser)
        if self.find:
            if self.find_attrs is not None:
                soup = soup.find(
                    self.find,
                    literal_eval('{' + self.find_attrs + '}')
                )
            else:
                soup = soup.find(self.find)
        if self.find_all:
            if self.find_all_attrs is not None:
                soup = soup.find_all(
                    self.find_all,
                    literal_eval('{' + self.find_all_attrs + '}')
                )
            else:
                soup = soup.find_all(self.find_all)
        if self.find_child:
            if self.find_child_attrs is not None:
                soup = soup.findChild(
                    self.find_child,
                    literal_eval('{' + self.find_child_attrs + '}')
                )
            else:
                soup = soup.findChild(self.find_child)
        if self.find_children:
            if self.find_children_attrs is not None:
                soup = soup.findChildren(
                    self.find_children,
                    literal_eval('{' + self.find_children_attrs + '}')
                )
            else:
                soup = soup.findChildren(self.find_children)
        if self.get_text and not (self.find_all or self.find_children):
            record[self.get_text_label] = \
                soup.get_text().decode('unicode_escape').encode('ascii', 'ignore')
        elif self.get_text and (self.find_all or self.find_children):
            record[self.get_text_label] = [
                i.get_text().decode('unicode_escape').encode('ascii', 'ignore')
                for i in soup
            ]
        else:
            record['soup'] = soup
        yield record

def get_metadata(self, url_list: List[str]) -> List[Dict]:
    """
    Metadata method
    ================
    function to get metadata of news

    Arguments
    ---------
    url_list: list of str
        url to crawl metadata

    Returns
    -------
    metadata: list of dict
        [{'press': press(str), 'date': publish date(str),
          'time': publish time(str), 'title': news title(str),
          'link': news url(str)}, ...]
    """
    metadata = []
    for url in url_list:
        # bs4
        req = requests.get(self.root_url + url)  # HTTP GET Request
        soup = BeautifulSoup(req.text, "html.parser")
        # press (news agency)
        press_list = soup.find_all("span", {"class": "press"})
        press_list = [press.text.strip() for press in press_list]
        # publication date
        p_datetime_list = soup.find_all("span", {"class": "wdate"})
        p_datetime_list = [
            p_datetime.text for p_datetime in p_datetime_list
        ]
        # titles and links
        news_links = soup.findChild("div", {"class": "mainNewsList"})
        news_links = news_links.find_all("a")
        news_titles = [link.text for link in news_links if link.text]
        news_links = [link["href"] for link in news_links]
        news_urls = []
        for news_link in news_links:
            if news_link not in news_urls:
                news_urls.append(news_link)
        for press, p_datetime, title, link in zip(press_list, p_datetime_list,
                                                  news_titles, news_urls):
            p_date, p_time = p_datetime.split()
            meta_dict = {
                "press": press,
                "date": p_date,
                "time": p_time,
                "title": title,
                "link": f"{self.root_url}{link}",
            }
            metadata.append(meta_dict)
    return metadata

def gz_thread(url, expected_price):
    req = gen_req(url)
    r = urllib2.urlopen(req).read()
    soup = BeautifulSoup(r, "html.parser")
    # print soup.prettify()
    div = soup.findChild('div', class_='t_fsz')
    content = div.findChild('font').string
    return content

def add_to_type_set(xml_obj: BeautifulSoup):
    """expects bs object"""
    tag_name = xml_obj.findChild().name
    global xml_types
    if tag_name not in xml_types:
        xml_types[tag_name] = xml_obj
    else:
        merge_xml(xml_obj, xml_types[tag_name])

def load_bundles(self):
    responseText = self.session.get(
        'https://itch.io/my-purchases/bundles').text
    soup = BeautifulSoup(responseText, 'html.parser')
    self.bundles = {}
    for bundle in soup.findChild('section', attrs={
        'class': 'bundle_keys'
    }).findChildren('a'):
        self.bundles[
            bundle.getText()] = 'https://itch.io' + bundle.get('href')

def get_paragraph(filename):
    b = BeautifulSoup(open(filename))
    contents = b.findChild(attrs={"name": "navercast_div"})

    def x(item):
        return isinstance(item, Tag)  # filter string

    contents = filter(x, contents)
    return contents

def parse_recipe(self, response):
    # Retrieves all relevant data from ruled.me
    got_data = requests.get(response.url)
    html = got_data.content
    soup = BeautifulSoup(html, 'lxml')
    recipe = RecipeItem()
    # Hard-coded for now. Consider parsing the url to get lunch
    recipe['time'] = "snacks"
    # get dataframe from html table.
    table = pd.read_html(html, header=0, index_col=0, flavor="bs4", encoding="utf-8")
    dataframe = []
    for tab in table:
        dataframe.append(tab)
    # get metadata on recipe
    recipe['rawTable'] = dataframe
    meta = soup.find('div', attrs={'class': 'entry-content'})
    meta_titleParent = soup.find('div', attrs={'class': 'articleTitle'})
    meta_title = meta_titleParent.find('h1').text
    recipe['title'] = meta_title
    imageParent = soup.findChild('div', attrs={'class': 'postImage_f'})
    image = imageParent.find('img')['src']
    recipe['image_urls'] = [image]
    date_parent = soup.find('div', attrs={'class': 'articleData'})
    date_dirty = date_parent.text
    date_final = date_dirty.split('on ')[1]
    recipe['date'] = date_final
    recipe['ingredients'] = []
    yield recipe

def parsePage(c, page, idx):
    """
    Parses a single page of the operone dictionary:
    Gets the page source; fixes problematic html tags (not closed or wrongly placed, mostly).
    For every entry on the page:
        runs parseExceptions on the line,
        takes the different parts of the line,
        simplifies and copies the transformed versions,
        writes the entry to the database.
    """
    page = urllib.request.urlopen(urllib.request.Request(operoneBaseUrl + page.get('href'))).read()
    page = page.decode('ISO-8859-1')  # encoding of the operone pages
    page = html.unescape(page)
    # page = str(page)  # cast to string (from stream)
    # page = html.unescape(page)  # unescape greek letters
    page = fixBadHtml(page)  # this automatically adds closing span tags at the end of a line if none are present
    pageSoup = BeautifulSoup(page, 'html.parser')
    lis = pageSoup.find_all('li')
    for element in lis:
        # print(element)
        # ok, we are getting additional translations or variations that belong to the word
        # for now we will be ignoring them by using only the first child.
        correctedLine = parseExceptions(str(element))
        element = BeautifulSoup(correctedLine, 'html.parser')
        # get_text can strip whitespaces but since we need the comma stripped as well
        # it makes more sense to put both into one context.
        # vocab is the raw string of the entry
        vocab = element.findChild().findChild().get_text().strip(', ')
        # apply fix dict for vocab part
        for entry in VOCAB_FIX_DICT:
            vocab = vocab.replace(entry, VOCAB_FIX_DICT[entry])
        # versions is a list of the different lookup words of the entry
        versions = [version.strip() for version in vocab.split(',')]
        # main is the first version - we will just assume that this is what we want ...
        main = versions[0]
        # alternate are all other versions concatenated by commas.
        alternate = ",".join(versions[1:])
        # start index behind the first span. used to separate
        # the lookup word from the translation since there can be
        # greek letters and tags in the translation (otherwise we could just take the text
        # with recursive=False to eliminate text in tags)
        tlStartIndex = str(element).find('</span>')
        tlStartIndex += len('</span>')
        # ok, so here we take off the first part of the entry which
        # contains the greek word. Then we feed the remaining string into
        # a new BeautifulSoup instance and strip remaining tags in the translation
        # with the get_text() method.
        subText = BeautifulSoup((str(element))[tlStartIndex:], 'html.parser')
        translation = str(subText.get_text()).strip()
        # pageNum = idx
        rough = greek_to_ascii(greek_simplify(main), False)
        precise = greek_to_ascii(greek_simplify(main), True)
        c.execute('INSERT INTO operonedict VALUES(?, ?, ?, ?, ?)',
                  (rough, precise, main, alternate, translation,))
        # print(translation)
    return len(lis)

class EndnoteXML(object): def __init__(self,fname): if (fname): f=open(fname) self.content=re.sub(r'</?style.*?>','',f.read()) f.close() else: self.content="" self.soup=BeautifulSoup(self.content,'html.parser') self.records=self.soup.records.contents self.length=len(self.records) for i in range(self.length): self.checktag(i,'titles') self.checktag(i,'authors') self.checktag(i,'urls') if (self.records[i].find('related-urls') is None): self.addtag(i,'related-urls','',parent='urls') if (self.records[i].find('pdf-urls') is None): self.addtag(i,'pdf-urls','',parent='urls') self.checktag(i,'dates') self.setdoi(i,self.getdoi(i)) #def __repr__(self): # return self.soup.encode() def __str__(self): return self.soup.encode() def reset(self,fname): self.__init__(fname) def read(self,fname): self.__init__(fname) def reads(self,s): self.content=s self.soup=BeautifulSoup(self.content,'html.parser') self.records=self.soup.records.contents self.length=len(self.records) for i in range(self.length): self.checktag(i,'titles') self.checktag(i,'authors') self.checktag(i,'urls') if (self.records[i].find('related-urls') is None): self.addtag(i,'related-urls','',parent='urls') if (self.records[i].find('pdf-urls') is None): self.addtag(i,'pdf-urls','',parent='urls') self.checktag(i,'dates') self.setdoi(i,self.getdoi(i)) def writes(self,encoding='utf-8'): return self.soup.encode(encoding=encoding) def write(self,fname,encoding='utf-8'): f=open(fname,'w') f.write(self.writes(encoding=encoding)) f.close() def getrecord(self,num): if (num>=self.length): return None return self.records[num] def checktag(self,num,tag): if self.records[num].find(tag) is None: self.addtag(num,tag,value='') def addtag(self,num,tag,value=None,parent=None): '''value can be string, tag''' a=self.soup.new_tag(tag) if value: a.string=value if parent: self.records[num].find(parent).append(a) else: self.records[num].append(a) def gettag(self,num,tag,parent=None,obj=False): if parent: if self.records[num].find(parent): if self.records[num].find(parent).find(tag): if (obj): return self.records[num].find(parent).find(tag) else: return self.records[num].find(parent).find(tag).string else: return '' else: return '' else: if self.records[num].find(tag): if (obj): return self.records[num].find(tag) else: return self.records[num].find(tag).string else: return '' def settag(self,num,tag,value,parent=None): if parent: if self.records[num].find(parent): if self.records[num].find(parent).find(tag): self.records[num].find(parent).find(tag).string=value else: self.addtag(num,tag,parent=parent,value=value) else: a=self.soup.new_tag(tag) a.string=value self.addtag(num,parent,parent=None,value=a) else: if self.records[num].find(tag): self.records[num].find(tag).string=value else: self.addtag(num,tag,parent=None,value=value) def getpath(self): db=self.soup.findChild("database") if (db): return os.path.splitext(db['path'])[0]+'.Data' else: return "" def getdoi(self,num): doistr=self.gettag(num,"electronic-resource-num") if (doistr): doiindex=doistr.find('10.') else: doiindex=-1 if (doiindex >=0): return doistr[doiindex:].lower().strip() else: return "" def setdoi(self,num,value): self.settag(num,"electronic-resource-num",value) def gettitle(self,num): return self.gettag(num,"title") def settitle(self,num,value): self.settag(num,"title",value) def getjournalfull(self,num): return self.gettag(num,'secondary-title') def getyear(self,num): return self.gettag(num,'year','dates') def setyear(self,num,value): self.settag(num,'year',value,'dates') def getvolume(self,num): 
return self.gettag(num,'volume') def setvolume(self,num,value): self.settag(num,'volume',value) def getissue(self,num): return self.gettag(num,'number') def setissue(self,num,value): self.settag(num,'number',value) def getpages(self,num): return self.gettag(num,'pages') def setpages(self,num,value): self.settag(num,'pages',value) def getnotes(self,num): return self.gettag(num,'notes') def setnotes(self,num,value): self.settag(num,'notes',value) def geturl(self,num): urls=self.gettag(num,'related-urls',obj=True) if (urls): return [ i.string for i in urls.find_all('url') ] else: return [] def seturl(self,num,value): '''Note that it will clean all the url!''' if (self.soup.find('related-urls') is not None): urls=self.gettag(num,'related-urls',obj=True) if (urls): urls.clear() else: self.addtag(num,'related-urls',parent='urls') self.addtag(num,'url',value,'related-urls') def addurl(self,num,value,first=False): urls=self.gettag(num,'related-urls',obj=True) a=self.soup.new_tag('url') a.string=value if (urls): if (not first): urls.append(a) else: urls.insert(0,a) else: self.settag(num,'related-urls',a,'urls') def getpdf(self,num): urls=self.gettag(num,'pdf-urls',obj=True) if (urls): return [ i.string for i in urls.find_all('url') ] else: return [] def setpdf(self,num,value): '''Note that it will clean all the url!''' if (self.soup.find('pdf-urls') is not None): urls=self.gettag(num,'pdf-urls',obj=True) if (urls): urls.clear() else: self.addtag(num,'pdf-urls',parent='urls') self.addtag(num,'url',value,'pdf-urls') def setpdfs(self,num,value): '''Note that it will clean all the url!''' if (self.soup.find('pdf-urls') is not None): urls=self.gettag(num,'pdf-urls',obj=True) if (urls): urls.clear() else: self.addtag(num,'pdf-urls',parent='urls') for url in value: self.addtag(num,'url',url,'pdf-urls') def addpdf(self,num,value,first=False): urls=self.gettag(num,'pdf-urls',obj=True) a=self.soup.new_tag('url') a.string=value if (urls): if (not first): urls.append(a) else: urls.insert(0,a) else: self.addtag(num,'pdf-urls',a,'urls') def finddoi(self,num,prefix='',issn=''): title=self.gettitle(num) doi=DOI(self.getdoi(num)) if (not prefix): prefix = doi.split('/',1)[0] if doi else "" volume= self.getvolume(num) journal=self.getjournalfull(num) year=self.getyear(num) pages=self.getpages(num) self.cr=CRrecord() try: # The origin doi maybe true. 
Find in crossref if ( doi and self.cr.getfromdoi(doi,fullparse=False) and self.cr.doi): # Further check title if (strdiff(doi,self.cr.doi)>=0.85 and \ strsimilarity(normalizeString(title),normalizeString(self.cr.title))>0.75): return doi if( volume and pages ): ops=pages.split('-') crps=self.cr.pages.split('-') if (len(ops)>0 and len(crps)>0 and ops[0]==crps[0] and volume==self.cr.volume): return doi if( year and pages ): ops=pages.split('-') crps=self.cr.pages.split('-') if (len(ops)>0 and len(crps)>0 and ops[0]==crps[0] and year==self.cr.year): return doi print "Origin DOI:",doi,"may be true but record strange..Try title" keyword=title+" "+journal+" "+year+" "+pages+" "+volume if (self.cr.getfromtitledoi(keyword,doi,year=year,limit=10,fullparse=False,prefix=prefix)): if (doi): if( prefix == self.cr.doi.split('/')[0] and strdiff(doi,self.cr.doi)>=0.85): return self.cr.doi else: print "Error for origin doi: "+doi+"; found: "+self.cr.doi return "" return self.cr.doi if (doi): if( strdiff(doi,self.cr.doi)>=0.85): return self.cr.doi else: print "Error2 for origin doi: "+doi+"; found: "+self.cr.doi return "" else: return "" except Exception as e: print "Error when find doi..",e,"\nRetry..." return self.finddoi(num,prefix=prefix,issn=issn) def preprocess(self): pass def cleannote(self,num): note=self.getnotes(num) notel=note.lower() if ("time" in notel): self.setnotes(num,notel[notel.find('time'):]) def cleanallpdf(self,exceptOAPDF=True): '''Clean PDF record or except OAPDF record''' for i in range(self.length): if (not exceptOAPDF): self.setpdf(i,'') else: for pdf in self.getpdf(i): if "internal-pdf://OAPDF/" in pdf: self.setpdf(i,pdf) break def process(self,fname="",cleannote=False,prefix='',issn='',start=0): epath=self.getpath() print "Output",self.length,"to",epath+os.sep+fname for i in range(start,self.length): try: #if (i%100 is 0): # print # print "Doing:",i+1, #else: # print i+1, pdfs=self.getpdf(i) urls=self.geturl(i) # Fast consider as record process before hasfound=False for pdf in pdfs: if "internal-pdf://OAPDF/" in pdf: hasfound=True doistr=self.gettag(i,"electronic-resource-num") if (doistr and len(doistr)>4 and doistr[:4]=='chk:'): doi=DOI(self.getdoi(i)) if doi: self.setdoi(i,"chk: "+doi) break if not hasfound: for url in urls: if "http://oapdf.sourceforge.net/cgi-bin/" in url: hasfound=True doistr=self.gettag(i,"electronic-resource-num") if (doistr and len(doistr)>4 and doistr[:4]=='chk:'): doi=DOI(self.getdoi(i)) if doi: self.setdoi(i,"chk: "+doi) break if hasfound: continue if (cleannote): self.cleannote(i) doistr=self.gettag(i,"electronic-resource-num") if (doistr and len(doistr)>4 and doistr[:4]=='chk:'): doi=DOI(self.getdoi(i)) else: doi=DOI(self.finddoi(i,prefix=prefix,issn=issn)) if doi: self.setdoi(i,"chk: "+doi) oapdflink="" if (doi and doi.is_oapdf()): oapdflink="http://oapdf.sourceforge.net/cgi-bin/doipage.cgi?doi="+doi newpdfs=[] for pdf in pdfs: pdfpath=pdf.replace("internal-pdf://",epath+os.sep+"PDF"+os.sep) relpath=pdf.replace("internal-pdf://","") # should never happen if (relpath == doi.quote()+".pdf"): newpdfs.append(pdf) continue if (doi): if (os.path.exists(pdfpath)): try: os.renames(pdfpath,epath+os.sep+"PDF"+os.sep+doi.quote()+".pdf") newpdfs.append("internal-pdf://"+doi.quote()+".pdf") except: print "Can't rename:",pdf,'to',doi.quote()+".pdf" newpdfs.append(pdf) continue else: print "Maybe error for the record",doi,"with pdf path:",pdf,'; Try finding..', pdfdir=os.path.split(pdfpath)[0] if (os.path.exists(pdfdir)): fs=glob.glob(pdfdir+os.sep+'*.pdf') if 
(len(fs)==1): try: os.renames(fs[0],epath+os.sep+"PDF"+os.sep+doi.quote()+".pdf") newpdfs.append("internal-pdf://"+doi.quote()+".pdf") print "Find",fs[0],'and rename!' except: print "Can't rename:",fs[0],'to',doi.quote()+".pdf" newpdfs.append(pdf) continue else: print "Can't find.." newpdfs.append(pdf) continue else: newpdfs.append(pdf) continue else: print "Blank doi for file:",pdf newpdfs.append(pdf) continue if (oapdflink): newpdfs.append("internal-pdf://OAPDF/"+doi.quote()+".pdf") self.setpdfs(i,newpdfs) # Set the urls if (oapdflink and oapdflink not in urls): self.addurl(i,oapdflink,first=True) except Exception as e: print "Error at ", i, 'since: ',e #return 1 if fname: self.write(fname) return 0
def touchpage(origin='.', doilink='../doilink', pdf=True, force=False):
    # Used to save local page records
    if not os.path.exists(doilink):
        os.makedirs(doilink + os.sep + 'pages')
    doilink = doilink.rstrip('/').rstrip('\\')
    sfurl = "http://oapdf.sourceforge.net/cgi-bin/touchdoi.cgi?owner=oapdf"
    workdir = os.path.abspath(origin).rstrip('\\').rstrip('/')
    count = 0
    touchcount = 1  # avoid submit when start
    forcesf = force  # force to overwrite the existing doilink page
    if (pdf):
        result = (chain.from_iterable(glob.iglob(os.path.join(x[0], '10.*.pdf')) for x in os.walk(workdir)))
    else:
        result = (chain.from_iterable(glob.iglob(os.path.join(x[0], '10.*.html')) for x in os.walk(workdir)))
    toappend = []
    newtouch = 0
    for f in result:
        if (touchcount % 50 == 0):
            r = requests.post(sfurl, params={'dois': json.dumps(toappend)}, timeout=120)
            if (r.status_code == 200):
                bs = BeautifulSoup(r.text, "html.parser")
                totaldid = bs.findChild('span', attrs={'id': 'total'})
                if totaldid and totaldid.text:
                    newtouch += int(totaldid.text)
                del toappend[:]
            else:
                print "Maybe Error when submit to SF-OAPDF.."
                sys.exit(1)
        count += 1
        fname = filebasename(f)
        if (' ' in fname):
            print "File name has blank!", f
            os.renames(f, os.path.split(f)[0] + os.sep + fname.strip() + os.path.splitext(f)[1])
            fname = fname.strip()
        doi = DOI(fname)
        if (doi):
            dirname = doilink + "/pages/" + doi.decompose(url=False, outdir=True)
            if (forcesf or not os.path.exists(dirname + fname + '.html')):
                touchcount += 1
                toappend.append(doi)
                try:
                    if (not os.path.exists(dirname)):
                        os.makedirs(dirname)
                    f = open(dirname + fname + '.html', "w")
                    f.close()
                except WindowsError as e:
                    print e
                except:
                    print "Something error for file:", f
        else:
            print "File name may be error (Not DOI name):", fname
    r = requests.post(sfurl, params={'dois': json.dumps(toappend)}, timeout=120)
    if (r.status_code == 200):
        bs = BeautifulSoup(r.text, "html.parser")
        totaldid = bs.findChild('span', attrs={'id': 'total'})
        if totaldid and totaldid.text:
            newtouch += int(totaldid.text)
        del toappend[:]
    else:
        print "Maybe Error when submit to SF-OAPDF.."
        sys.exit(1)
    print "Process total file:", count, "; local touch new:", touchcount - 1, "; remote touch:", newtouch

# This script is to download pics from the funnie.st site
import requests
from bs4 import BeautifulSoup
from progressbar import *

link = raw_input('Paste the link here: ')
res = requests.get(link)
data = BeautifulSoup(res.content)
Total_image = int(data.findChild('h3').string[-2:])
for pic in range(Total_image):
    res = requests.get(link)
    data = BeautifulSoup(res.content)
    next_image = data.findChildren('link')
    image_name = link[24:-3] + link[-2:-1]
    for img in next_image:
        if 'next' in img['rel']:
            link = img['href']
    image = data.findChildren('img')
    for i in image:
        if 'jpg' in i['src']:
            r = requests.get(i['src'])
            size = float(r.headers['content-length'])
            mbSize = 1024 * 1024  # used for conversion to Mb
            TotalSize = (size) / mbSize
            widgets = ['Test: ', Percentage(), ' ', Bar(">"), ' ', ETA(), ' ', FileTransferSpeed()]
            progress = ProgressBar(widgets=widgets, maxval=TotalSize)
            progress.start()

from bs4 import BeautifulSoup, NavigableString, Tag
import csv
import json
import time
import sys
import urllib2  # needed for urlopen below; missing from the original imports

pages = 5
page = 0
athletes = []
while page < pages:
    result = urllib2.urlopen("http://www.teamusa.org/athletes?pg=" + str(page))
    soup = BeautifulSoup(result)
    # The list of athletes on a given page is in a <ul> with the class 'thumb-row athletes'
    content = soup.findChild(class_="thumb-row athletes")
    # Each <li> contains the info about an athlete
    for each in content.find_all("li"):
        athlete = {}
        # Get name
        try:
            name = "".join(each.findChild("h4").string)
            name = name.replace("\n", "").replace("\r", "").strip().encode("utf8")
            athlete["name"] = name
        except AttributeError:
            continue
        # Get link to profile
        try: