def grab_all(self):
    self._local_setup()
    self.next_url = 'http://portal.ruc.edu.cn/cas/login?service=http%3A%2F%2Fportal.ruc.edu.cn%2Fidc%2Feducation%2Fselectcourses%2Fresultquery%2FResultQueryAction.do%3Fmethod%3DforwardAllQueryXkjg'
    self._login()
    r_cookies = requests.post(self.next_url, cookies=self.cookies, verify=False)
    content = r_cookies.content.decode(self.charset)
    self.cookies = r_cookies.cookies

    # parser, start.
    #  - get colleges
    strainer_colleges = SoupStrainer("select", id="condition_yx")
    soup_colleges = BeautifulSoup(r_cookies.content.decode('gbk'),
                                  parse_only=strainer_colleges)
    colleges = [option['value'] for option in soup_colleges.select("option")
                if option['value']]
    colleges_name = [option.get_text() for option in soup_colleges.select("option")
                     if option['value']]
    pretty_print(colleges_name)
    print "{0} colleges.".format(len(colleges))

    #  - iter colleges
    total_courses = 0
    for i, college in enumerate(colleges):
        courses = []
        url_courses = 'http://portal.ruc.edu.cn/idc/education/selectcourses/resultquery/ResultQueryAction.do'
        # get courses
        for j in range(1, 15):
            data = {
                'method': "allJxb",
                'condition_xnd': "2012-2013",
                'condition_xq': "1",
                'condition_yx': college.encode('gbk'),
                'isNeedInitSQL': "true",
                'ksj1': j,
                'ksj2': j,
            }
            r_courses = requests.post(url_courses, data=data, cookies=self.cookies)
            content = r_courses.content.decode('gbk')
            soup_courses = BeautifulSoup(content)
            rows = soup_courses.find_all("row")
            if len(rows) == 1:
                continue
            for r in rows:
                teacher = r.select("xm")[0].get_text(strip=True).replace('/', ',')
                time_and_location_texts = r.select("sksj > tagbr")
                lessons = self.get_lessons(time_and_location_texts)
                course = {
                    'original_id': r.select("jxbh")[0].get_text(strip=True),
                    'name': r.select("kcmc")[0].get_text(strip=True),
                    'credit': str(float(r.select("xf")[0].get_text(strip=True))),
                    'teacher': teacher,
                    'lessons': lessons,
                }
                courses.append(course)

        print "#{0} {1}: {2} courses.".format(i, colleges_name[i].encode('utf8'), len(courses))
        if len(courses) == 0:
            continue
        total_courses += len(courses)

        output_dir = os.path.join(os.path.dirname(__file__), 'ruc')
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
        if courses != []:
            with open(os.path.join(output_dir, colleges_name[i] + '.yaml'), 'w') as yaml_file:
                yaml_file.write(pretty_format(courses))

    print "Done! Totally exported {0} courses.".format(total_courses)
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup, SoupStrainer
# from selenium import webdriver
import requests
import re

##########
# http://stackoverflow.com/questions/25539330/speeding-up-beautifulsoup
session = requests.Session()
response = session.get(
    "https://www.treasury.gov/resource-center/sanctions/OFAC-Enforcement/Pages/OFAC-Recent-Actions.aspx"
)

# strainer = SoupStrainer("table")
strainer = SoupStrainer("table", {"class": "ms-rteTable-default"})
soup = BeautifulSoup(response.content, "lxml", parse_only=strainer)
# print(soup.get_text)

row_data = []
for row in soup.find_all("tr"):
    temp = []
    cols = row.find_all("td")
    cols = [ele.text.strip() for ele in cols]
    row_data.append(cols)
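# The StackOverflow thread linked above is about cutting parse time by limiting what
# BeautifulSoup builds. The sketch below is illustrative only (synthetic markup, not the
# OFAC page above); it assumes lxml is installed, as in the script it follows.
import timeit

from bs4 import BeautifulSoup, SoupStrainer

# A synthetic page: one small table of interest buried in a lot of unrelated markup.
sample_html = "<div><p>noise</p></div>" * 5000 + \
              "<table class='ms-rteTable-default'><tr><td>row</td></tr></table>"

full_parse = timeit.timeit(lambda: BeautifulSoup(sample_html, "lxml"), number=20)
strained_parse = timeit.timeit(
    lambda: BeautifulSoup(
        sample_html, "lxml",
        parse_only=SoupStrainer("table", {"class": "ms-rteTable-default"})),
    number=20)

# The strained parse keeps only the matching <table> subtree, so it builds a much smaller
# tree; the actual speedup depends on the page and the parser.
print("full parse: {:.3f}s, strained parse: {:.3f}s".format(full_parse, strained_parse))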
def __init__(self, page):
    only_body = SoupStrainer('body')
    self.dom = BeautifulSoup(page, 'html.parser', parse_only=only_body)
import os
from typing import Set
from urllib.parse import urljoin

import matplotlib
import requests
from PIL import Image, ImageDraw, ImageFont
from bs4 import BeautifulSoup, SoupStrainer

matplotlib.use('Agg')
from wordcloud import STOPWORDS, WordCloud

LINK_URL = "https://wikis.nyu.edu/plugins/pagetree/naturalchildren.action?decorator=none&excerpt=false&sort=position" \
           "&reverse=false&disableLinks=false&expandCurrent=true&hasRoot=true&pageId=20608012&treeId=0&startDepth=0" \
           "&mobile=false&ancestors=68296313&ancestors=20608012&treePageId=68296315&_=1504714430704"

only_wiki_links = SoupStrainer('div', id='children68296313-0')
only_main_content = SoupStrainer('div', id="main-content")
only_comments = SoupStrainer('div', id='comments-section')

with open(os.path.join(os.path.dirname(__file__), 'stopwords')) as stopwords_file:
    STOPWORDS |= set(x.strip() for x in stopwords_file.readlines())

__all__ = ['save_word_cloud']


def get_links(session: requests.Session) -> Set[str]:
    link_page = session.get(LINK_URL)
    link_soup = BeautifulSoup(link_page.content, 'lxml',
soup_take = BeautifulSoup(this_is_html, 'html.parser')

# if I want to scrape only small portions of the data
print("'''''''''''''''''''''''''''''''")
print('soup_take.get_text(): ', soup_take.get_text())

print("__________________soupstrainer_____________________")
# Now import the SoupStrainer class
from bs4 import SoupStrainer

give_only = SoupStrainer(id="google")
print(BeautifulSoup(this_is_html, 'html.parser', parse_only=give_only))
print("_______________________________________")

give_only = SoupStrainer(id="lohit")
print(BeautifulSoup(this_is_html, 'html.parser', parse_only=give_only))
print("_______________________________________")

give_only = SoupStrainer(id="match")
print(BeautifulSoup(this_is_html, 'html.parser', parse_only=give_only))
def insert_anchor():
    paths = [
        "globals.html", "filemanfiles.html", "Packages_Namespace_Mapping.html",
        "filemansubfiles.html", "routines.html", "packages.html"
    ]
    i = 0
    while i < len(paths):
        stype = ''
        name = ''
        path_ = ''
        entry = OrderedDict()
        entries = OrderedDict()
        jsonEntries = []
        validate = True
        header = {"name": "Methods", "isHeader": validate}
        jsonEntries.append(header)
        page = open(os.path.join(output, paths[i]), 'r').read()
        from bs4 import SoupStrainer

        if paths[i] in 'packages.html':
            stype = 'Package'
        elif paths[i] in 'routines.html':
            stype = 'Method'
        elif paths[i] in 'globals.html':
            stype = 'Global'
        elif paths[i] in 'filemanfiles.html':
            stype = 'File'
            list_fileman = []
            bsFile = bs(page, parse_only=SoupStrainer('td'))
            for a in bsFile.find_all('a'):
                text_ = ''
                entry = OrderedDict()
                add_path = '//apple_ref/cpp/Method/'
                print('Running filemanfiles')
                name = urllib.unquote(a.get('href')).encode('utf-8')
                text_ = text_ + a.text
                text = text_
                path_ = urllib.unquote(name).encode('utf-8')
                if path_ in list_fileman:
                    continue
                add_path += text
                list_fileman.append(path_)
                entry['name'] = text_
                entry['path'] = add_path
                entry['entryType'] = stype
                jsonEntries.append(entry)
                entries['entries'] = jsonEntries
                with open(os.path.join(output, paths[i]) + ".dashtoc", "w") as json_file:
                    json.dump(entries, json_file)
                    json_file.close()
                try:
                    if name != '':
                        cur.execute(
                            'INSERT INTO searchIndex(type, name, path) values(?, ?, ?)',
                            (stype, text_, name + '#' + add_path))
                        print 'index already uploaded'
                except sqlite3.IntegrityError as err:
                    print(err)
                    # sqlite3.IntegrityError: column bar is not unique
        elif paths[i] in 'filemansubfiles.html':
            stype = 'File'
            list_fileman = []
            bsFile = bs(page, parse_only=SoupStrainer('td'))
            for a in bsFile.find_all('a'):
                entry = OrderedDict()
                add_path = '//apple_ref/cpp/Method/'
                print('Running filemansubfiles')
                name = urllib.unquote(a.get('href')).encode('utf-8')
                path_ = urllib.unquote(name).encode('utf-8')
                if path_ in list_fileman:
                    continue
                add_path += path_[:-len('.html')]
                list_fileman.append(path_)
                entry['name'] = name[:-len('.html')]
                entry['path'] = add_path
                entry['entryType'] = stype
                jsonEntries.append(entry)
                entries['entries'] = jsonEntries
                with open(os.path.join(output, paths[i]) + ".dashtoc", "w") as json_file:
                    json.dump(entries, json_file)
                    json_file.close()
                try:
                    if name != '':
                        cur.execute(
                            'INSERT INTO searchIndex(type, name, path) values(?, ?, ?)',
                            (stype, name[:-len('.html')], name + '#' + add_path))
                        print 'index already uploaded'
                except sqlite3.IntegrityError as err:
                    print(err)
        else:
            stype = 'Namespace'
            list_fileman = []
            bsFile = bs(page, parse_only=SoupStrainer('td'))
            for a in bsFile.find_all('a'):
                entry = OrderedDict()
                add_path = '//apple_ref/cpp/Method/'
                print('Running Namespaces_Packages_Mapping')
                name = urllib.unquote(a.get('href')).encode('utf-8')
                path_ = urllib.unquote(name).encode('utf-8')
                if path_ in list_fileman:
                    continue
                add_path += path_[:-len('.html')]
                list_fileman.append(path_)
                entry['name'] = name[:-len('.html')]
                entry['path'] = add_path
                entry['entryType'] = stype
                jsonEntries.append(entry)
                entries['entries'] = jsonEntries
                with open(os.path.join(output, paths[i]) + ".dashtoc", "w") as json_file:
                    json.dump(entries, json_file)
                    json_file.close()
                try:
                    if name != '':
                        cur.execute(
                            'INSERT INTO searchIndex(type, name, path) values(?, ?, ?)',
                            (stype, name[:-len('.html')], name + '#' + add_path))
                        print 'index already uploaded'
                except sqlite3.IntegrityError as err:
                    print(err)

        bsFile = bs(page, 'html5lib')
        for a in bsFile.find_all('a', attrs={'class': 'el'}):
            entry = OrderedDict()
            add_path_global = ''
            name = urllib.unquote(a.get('href')).encode('utf-8')
            text = a.text
            add_path = '//apple_ref/cpp/Method/'
            without_html = name[:-len('.html')]
            add_path_global += add_path + text
            add_path += urllib.unquote(without_html).encode('utf-8')
            entry["name"] = name[:-len('.html')]
            entry["path"] = add_path
            if stype == 'Global':
                entry['name'] = text
                entry['path'] = add_path_global
            entry["entryType"] = stype
            jsonEntries.append(entry)
            entries["entries"] = jsonEntries
            with open(os.path.join(output, paths[i]) + ".dashtoc", "w") as json_file:
                json.dump(entries, json_file)
                json_file.close()

            jsonEntries_index = []
            entry = OrderedDict()
            entries_index = {}  # each entry type node of the main sub-html file
            header_index = {"name": stype, "isHeader": validate}
            jsonEntries_index.append(header_index)
            raw_data = open(os.path.join(output, name), 'r')
            dom = fromstring(raw_data.read())
            indexmumps = dom.xpath('//p//span//a/@href')
            if not indexmumps:
                pass
            else:
                item = indexmumps[0]
                entry["path"] = indexmumps[0]
                entry["name"] = item[:-len('.html')]
                entry["entryType"] = stype
                entries_index["entries"] = entry

            ##################################################################################
            # Validate the xpath against the DOM for each href tag in each html file.
            # This should query the DOM and validate stype for each entry type.
            ##################################################################################
            list_index = []
            for link in dom.xpath('//td//a/@href'):  # select the url in href for all <a> tags (links)
                print('Index from ' + name + ' page: ' + link)
                entry = OrderedDict()
                if link in list_index:
                    continue
                entry["name"] = link[:-len('.html')]
                entry["path"] = link
                list_index.append(link)
                if '#' in link:
                    continue
                entry["entryType"] = stype
                jsonEntries_index.append(entry)
                entries_index["entries"] = jsonEntries_index
                with open(os.path.join(output, path_) + ".dashtoc", "w") as json_index:
                    json.dump(entries_index, json_index)
                    json_index.close()

            ##################################################################################
            # TODO: parse and rewrite the html with this anchor tag so that the query and
            # Tree Explorer interface of the Zeal application can identify it.
            # TODO: complete the html with the remaining anchor tags and the rest of the
            # document, including "name", "path", and "entryType", and save the databases.
            ##################################################################################
            try:
                if stype == 'Global':
                    cur.execute(
                        'INSERT INTO searchIndex(type, name, path) values(?, ?, ?)',
                        (stype, text, name + '#' + add_path_global))
                    continue
                cur.execute(
                    'INSERT INTO searchIndex(type, name, path) values(?, ?, ?)',
                    (stype, name[:-len('.html')], name + '#' + add_path))
                print 'index already uploaded'
            except sqlite3.IntegrityError as err:
                print(err)
        i += 1
def parse_html_links(self, html):
    return BeautifulSoup(html,
                         parse_only=SoupStrainer('a'),
                         features='html.parser')
# (ii)  soup = BeautifulSoup(html_markup, "lxml")
# (iii) soup = BeautifulSoup(html_markup, "lxml", parse_only=SoupStrainer("a"))

# The BeautifulSoup constructor plays an important part, and we will explore some of its
# important parameters here:
# (i)   markup: the first parameter passed to the constructor; accepts the string or
#       file-like object to be parsed.
# (ii)  features: the name of the parser or the type of markup to be used. The parser can
#       be lxml, lxml-xml, html.parser, or html5lib. If we just want to parse some HTML,
#       we can simply pass the markup to BeautifulSoup and it will use whichever
#       appropriate parser is installed.
# (iii) parse_only: accepts a bs4.SoupStrainer object; only the parts of the document
#       matching the SoupStrainer object will be parsed.

# In this example we create the soupA object using lxml as the parser, along with the
# SoupStrainer object tagsA -> parsing only <a>, that is, the anchor elements of the HTML.
tagsA = SoupStrainer("a")
soupA = BeautifulSoup(html_doc, 'lxml', parse_only=tagsA)
soup = BeautifulSoup(html_doc, 'lxml')

# The .prettify() function returns a Unicode string and presents the string in a clean,
# formatted structure that is easy to read.
soupA
#print soupA.prettify()
#print

# Document-based elements (such as HTML tags) in a parsed tree can have various attributes
# with predefined values. Verifying whether an element contains certain attributes can be
# handy when traversing the tree.
# Remember -> soupA.a returns the first <a> element or tag found in html_doc.
soupA.a.has_attr("class")
soupA.a.has_attr("name")
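# Follow-up sketch to the notes above: SoupStrainer accepts the same kinds of filters as
# find_all(), so straining is not limited to a bare tag name. The html_fragment string
# below is made up for illustration and is not part of the original example.
from bs4 import BeautifulSoup, SoupStrainer

html_fragment = """
<a href="https://example.com/a" class="ext">external</a>
<a href="/local" class="int">internal</a>
<p class="ext">not a link</p>
"""

# Strain by tag name plus an attribute filter: only <a> tags whose class is "ext".
only_external_links = SoupStrainer("a", class_="ext")
print(BeautifulSoup(html_fragment, 'lxml', parse_only=only_external_links))

# Attribute filters can also be functions; keep only tags whose href starts with "/".
only_local_hrefs = SoupStrainer(href=lambda value: value and value.startswith("/"))
print(BeautifulSoup(html_fragment, 'lxml', parse_only=only_local_hrefs))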
else:
    # initialise logging configs; note: only the latest lyric's log is saved since mode = 'w'
    logging.basicConfig(
        level=logging.DEBUG,
        format='%(asctime)s %(levelname)-8s\n\n%(message)s\n',
        datefmt='%a, %d %b %Y %H:%M:%S',
        filename='/tmp/ly.log',
        filemode='w')

    # get the first 7 links from the DuckDuckGo search engine.
    res = urllib.request.urlopen('https://duckduckgo.com/html/?q=' +
                                 '+'.join(sys.argv[1:]) + '+lyrics azlyrics').read()
    soup = BeautifulSoup(res, 'html.parser',
                         parse_only=SoupStrainer('a', {'class': 'result__snippet'}))
    results = soup.find_all('a', limit=7)
    visited = []

    # get the reconstructed 'https://www.azlyrics.com*' url if available.
    url_info = None
    for tag in results:
        parsed = urllib.parse.urlparse(tag['href'])
        temp = urllib.parse.parse_qs(parsed.query)['uddg'][0]
        visited.append(temp)  # append visited url for logging
        match = re.search('azlyrics..*\/lyrics', temp)
        if match:
            url_info = temp, URLS[match.group()]
            break

    if url_info:
        lyrics = get_lyrics(url_info)
def sources(self, data, hostDict, hostprDict):
    try:
        isMovie = (data['type'] == 'movie')
        episode = data.get('episode', '')
        pageURL = data['pageURL']
        stringConstant = data['sConstant']

        session = self._createSession(data['UA'], data['cookies'])
        xbmc.sleep(1200)
        r = self._sessionGET(pageURL, session)
        if not r.ok:
            self._logException('%s Sources page request failed' % data['type'].capitalize())
            return None

        pageHTML = r.text
        timeStamp = self._getTimeStamp(pageHTML)

        # Get a HTML block with a list of host names and internal links to them.
        session.headers['Referer'] = pageURL  # Refer to this page that "we're on" right now to avoid suspicion.
        pageID = pageURL.rsplit('.', 1)[1]
        token = self._makeToken({'ts': timeStamp}, stringConstant)
        xbmc.sleep(200)
        serversHTML = self._getServers(pageID, timeStamp, token, session)

        # Go through the list of hosts and create a source entry for each.
        sources = []
        tempTokenData = {'ts': timeStamp, 'id': None, 'server': None, 'update': '0'}
        baseInfoURL = self.BASE_URL + self.INFO_PATH
        soup = BeautifulSoup(serversHTML, 'html.parser',
                             parse_only=SoupStrainer('div',
                                                     {'class': 'server row', 'data-id': True},
                                                     recursive=False))
        for serverDIV in soup:
            tempTokenData['server'] = serverDIV['data-id']
            hostName = serverDIV.label.text.strip().lower()
            hostName = self.DEBRID_HOSTS.get(hostName, hostName)

            for a in serverDIV.findAll('a', {'data-id': True}):
                # The text in the <a> tag can be the movie quality ("HDRip", "CAM" etc.) or, for TV
                # shows, it's the episode number with a one-zero padding, like "09", for each
                # episode in the season.
                label = a.text.lower().strip()
                # A string identifying a host embed to be retrieved from putlocker's servers.
                hostID = a['data-id']

                if isMovie or episode == str(int(label)):
                    if isMovie:
                        if 'hd' in label:
                            quality = 'HD'
                        else:
                            quality = 'SD' if ('ts' not in label and 'cam' not in label) else 'CAM'
                    else:
                        quality = 'SD'

                    tempTokenData['id'] = hostID
                    tempToken = self._makeToken(tempTokenData, stringConstant)

                    # Send data for the resolve() function below to use later, when the user plays an item.
                    # We send the CF cookies from the session (instead of reusing them from data['cfCookies'])
                    # because they might've changed.
                    unresolvedData = {
                        'url': baseInfoURL % (timeStamp, tempToken, hostID, tempTokenData['server']),
                        'UA': data['UA'],
                        'cookies': session.cookies.get_dict(),
                        'referer': pageURL + '/' + hostID
                    }
                    sources.append({
                        'source': hostName,
                        'quality': quality,
                        'language': 'en',
                        'url': unresolvedData,  # Doesn't need to be a string, just repr()-able.
                        'direct': False,
                        'debridonly': False
                    })
        return sources
    except:
        self._logException()
        return None
__author__ = "Samaun Ibna Faiz"

import json
from urllib import request

from bs4 import BeautifulSoup, SoupStrainer

################################################
#  Important conference event dates/deadlines  #
################################################
source = 'https://acl2020.org/'

page_content = SoupStrainer('section', class_='page__content')
soup = BeautifulSoup(request.urlopen(source), 'html.parser', parse_only=page_content)

important_dates = [{
    'Event': (c := r.find_all('td'))[0].text,
    'day': c[1].text.replace('\u2013', '-'),
    'date': c[2].text.replace('\u2013', '-')
} for r in soup.find('h2', {'id': 'dates'}).find_next_sibling('center').select('table tbody tr')]

print(json.dumps(important_dates, indent=4))

################################################
#  Accepted tutorials list                     #
################################################
source = 'https://acl2020.org/program/tutorials/'
def handle_one_page(self, driver):
    """Override the parent class method to implement the actual crawling logic."""
    url = self.entrance_url
    keyword = self.product_type
    driver.get(url)
    time.sleep(10)
    print "Initial Page:", url

    # driver = self.submit_initial_url(driver, "//input[@type='text']", "gh-btn", keyword)  # ebay
    # driver = self.submit_initial_url(driver, "//input[@type='search']", "search-button", keyword)
    driver.find_element_by_xpath("//input[@type='search']").clear()
    driver.find_element_by_xpath("//input[@type='search']").send_keys(keyword)

    # Get the submit button and click it.
    # elem = driver.find_element_by_id(submit_key)  # used for ebay / ebuyer
    elem = driver.find_element_by_xpath('//*[@id="hFull"]/div[2]/div[1]/button')
    elem.click()
    time.sleep(20)  # pause for a moment in case the page has not finished redirecting
    print "Get Crawler Home Page:", driver.current_url

    i = 0
    while i < 500:
        # Fetch the HTML document of the current page.
        response_html = self.get_htmlcontent(driver.current_url)
        try:
            if response_html.status_code != 200:
                print "Get status_code, but Exception:response_html.status_code=", response_html.status_code
                break
        except:
            print "Exception:response_html.status_code=", response_html.status_code
            break

        # Only extract the content part of the document, which makes parsing faster.
        html_part_id_value = "lpBloc"
        # only_content_tags = SoupStrainer("ul", id=html_part_id_value)
        only_content_tags = SoupStrainer(id=html_part_id_value)
        html_part_content = BeautifulSoup(response_html.text, "html.parser",
                                          parse_only=only_content_tags).prettify()

        # Parse all the links we need.
        soup = BeautifulSoup(html_part_content, "html.parser", from_encoding="utf-8")
        # links = soup.find_all('a', class_='jsQs', href=re.compile(self.product_type, re.I))
        links = soup.find_all('a', class_='jsQs')  # , href=re.compile("Phone")
        for link in links:
            new_url = link['href']
            self.handle_result_url(new_url, keyword, i)
        time.sleep(10)
        i = i + 1

        # current_page = "a.pg curr"
        # print "The ", driver.find_element_by_css_selector(current_page).text, " Has Finished"
        try:
            # nextPage = "a.gspr.next"
            # driver.find_element_by_css_selector(nextPage).click()  # ebay
            # driver.find_element_by_xpath("//*[@id='main-content']/div/div[1]/div[2]/div[1]/ul/li[6]/a").click()  # Cdiscount
            nextPage = "a.jsNxtPage.pgNext"
            driver.find_element_by_css_selector(nextPage).click()
            print driver.current_url
            time.sleep(20)
        except:
            print "Exception:Get Next page Fail", response_html.status_code
            break

    driver.quit()
    self.db.close()
    bibtex = bibtex.strip()

    # figure out which Journal (if any) this is
    PDFURL = None
    match = re.search('[jJ]ournal\s*=\s*\{(.*?)\}', bibtex)
    Journal = ''
    if match:
        Journal = match.group(1)

    # get PDF for Wind Energy
    if Journal == 'Wind Energy' or Journal == 'Wind Energ.':
        PDFURL = 'http://onlinelibrary.wiley.com/doi/' + doi + '/pdf'

        # need to do additional parsing to get a direct link to the PDF
        r = requests.get(PDFURL)
        only_iframe = SoupStrainer('iframe', {'id': 'pdfDocument'})
        webpage = BeautifulSoup(r.text, parse_only=only_iframe)
        if webpage.iframe is not None:
            PDFURL = webpage.iframe['src']

    # [INSERT HERE: if you want to try to auto link a PDF from some other journal
    # follow the example above for Wind Energy. I've already parsed out the
    # journal name. You could potentially parse out other bits of info from the
    # BibTeX as search criteria. ]

    # show bibtex
    sys.stdout.write(bibtex)

elif action == 'url':
    call(['open', 'http://dx.doi.org/' + doi])
def __init__(self, markup='lxml', is_async=True):
    self.is_async = is_async

    parser = self.get_parser()
    parser.add_argument('--include_comments', help='include comments', action='store_true')
    parser.add_argument('--comments_per_page', help='comments per page to be crawled',
                        default=40, type=int)
    parser.add_argument('--gallery_id', help='specify gallery id such as: cat, dog',
                        default='cat', type=str)
    parser.add_argument('--init_post_id', help='initial post_id to start crawling',
                        default=0, type=int)
    parser.add_argument('--final_post_id', help='final post_id to stop crawling',
                        default=10000, type=int)
    parser.add_argument('--forever', help='try crawling for forever', action='store_true')
    parser.add_argument('--timeout', help='crawling timeout per request',
                        default=5, type=float)
    parser.add_argument('--interval',
                        help='crawling interval per request to prevent blocking',
                        default=0.5, type=float)
    parser.add_argument('--metadata_to_dict',
                        help='return metadata into dictionary type',
                        action='store_true')
    parser.add_argument('--filename', help="filename to be saved.", default="gallery.txt")
    self.options, _ = parser.parse_known_args()

    self._session = requests.Session()
    self._markup = markup
    self._view_url = 'http://gall.dcinside.com/board/view'
    self._comment_view_url = 'http://gall.dcinside.com/board/view'
    self._current_post_id = self.options.init_post_id
    self._strainer = SoupStrainer(
        'div',
        attrs={
            'class': [
                're_gall_top_1',     # title, author, and written time
                'btn_recommend',     # upvotes and downvotes
                'gallery_re_title',  # comments
                's_write',           # post body
            ]
        })

    # Custom header is required in order to request.
    self.header = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:59.0) '
                      'Gecko/20100101 Firefox/59.0'
    }
fifthDeclensionEndingsMFSg = ['es', 'ei', 'ei', 'em', 'e', 'es', 'e']
fifthDeclensionEndingsMFPl = ['es', 'erum', 'ebus', 'es', 'ebus', 'es', 'ebus']

client = MongoClient()
db = client.LATIN_DICT
#db.words.delete_many({})
#result = db.words.insert_one({ "puella" :["puella, puellae F"]})

for character in letters:
    http = httplib2.Http()
    status, response = http.request('http://latin-dictionary.net/list/letter/' + character)
    for counter, word in enumerate(
            BeautifulSoup(response, parseOnlyThese=SoupStrainer('li', {'class': 'word'}))):
        if counter % 200 == 0:
            print counter
        tmp = word.contents
        link = word.contents[0]['href']
        #print link
        if len(tmp) == 2:
            words = re.split(
                '\s+',
                tmp[0].get_text().strip().replace(',', '').replace('(', '')
                      .replace(')', '').replace('.', ''))
            wordType = tmp[1].strip()
            # handle nouns
            if wordType == 'n':
                if len(words) != 2:
                    print words
            'Connection attempted a redirect (fcrov_data.py). Trying again in 2 minutes...'
        )
        t = time.ctime()
        print(t)
        time.sleep(120.0 - ((time.time() - starttime) % 120.0))
    else:
        print('Connection successful (fcrov_data.py).')

########## Begin processing response from HTTP request

# Create filter with SoupStrainer to limit parsing to main div | This id may change, watch out
res_filter = SoupStrainer('div', {'id': 'gems_results'})

# Grab the strained soup
soup = BeautifulSoup(res.content, 'lxml', parse_only=res_filter)

########### Cooking soup / breaking down content
soup.p.wrap(soup.new_tag("table"))
soup.p.wrap(soup.new_tag("tr"))
soup.p.wrap(soup.new_tag("td"))

# Create an array of tag attributes to remove
REMOVE_ATTRIBUTES = [
    'style', 'style', 'class', 'border', 'align', 'valign', 'cellpadding',
    'cellspacing', 'colspan', 'width'
]
def list_contents2(self):
    if DEBUG:
        self.log('content_list2()')
    if self.parameters('key') == 'showing':
        page_data = fetch(SHOWING_URL).text
        tlink = SoupStrainer('div', {'id': 'main'})
    else:
        year, month, _ = datetime.date.today().isoformat().split('-')
        page_data = ''
        nyear = int(year)
        for i in range(4):
            nmonth = int(month) + i
            if nmonth > 12:
                nmonth = nmonth - 12
                nyear = int(year) + 1
            url = COMING_URL.format(nyear, nmonth)
            page_data += fetch(url).text
        tlink = SoupStrainer('div', {'class': 'list detail'})

    mdiv = BeautifulSoup(page_data, "html.parser", parse_only=tlink)
    videos = mdiv.find_all('table')
    h = html_parser.HTMLParser()
    for video in videos:
        vdiv = video.find('a', {'itemprop': 'trailer'})
        if vdiv:
            videoId = vdiv.get('href').split('?')[0].split('/')[-1]
            plot = h.unescape(video.find(class_='outline').text).strip()
            tdiv = video.find(class_='image')
            icon = tdiv.find('img')['src']
            title = tdiv.find('img')['title']
            # imdb = tdiv.find('a')['href'].split('/')[-2]
            poster = icon.split('_')[0] + 'jpg'
            infos = video.find_all(class_='txt-block')
            director = []
            directors = infos[0].find_all('a')
            for name in directors:
                director.append(name.text)
            cast = []
            stars = infos[1].find_all('a')
            for name in stars:
                cast.append(name.text)
            labels = {'title': title,
                      'plot': plot,
                      # 'imdbnumber': imdb,
                      'director': director,
                      'cast': cast}
            try:
                year = int(re.findall(r'\((\d{4})', title)[0])
                title = re.sub(r'\s\(\d{4}\)', '', title)
                labels.update({'title': title, 'year': year})
            except IndexError:
                pass
            listitem = xbmcgui.ListItem(title)
            listitem.setArt({'thumb': poster,
                             'icon': icon,
                             'poster': poster,
                             'fanart': _fanart})
            listitem.setInfo(type='video', infoLabels=labels)
            listitem.setProperty('IsPlayable', 'true')
            url = sys.argv[0] + '?' + urllib.parse.urlencode({'action': 'play',
                                                              'videoid': videoId})
            xbmcplugin.addDirectoryItem(int(sys.argv[1]), url, listitem, False)

    # Sort methods and content type...
    xbmcplugin.setContent(int(sys.argv[1]), 'movies')
    xbmcplugin.addSortMethod(int(sys.argv[1]), xbmcplugin.SORT_METHOD_UNSORTED)
    xbmcplugin.addSortMethod(int(sys.argv[1]), xbmcplugin.SORT_METHOD_VIDEO_TITLE)
    if force_mode:
        xbmc.executebuiltin('Container.SetViewMode({})'.format(view_mode))
    # End of directory...
    xbmcplugin.endOfDirectory(int(sys.argv[1]), True)
import requests
from bs4 import BeautifulSoup, SoupStrainer

url = 'http://en.wikipedia.org/wiki/Category:Crimes'

strain = SoupStrainer(id='mw-pages')
soup = BeautifulSoup(requests.get(url).text, parse_only=strain)
links = soup.find_all('a')

weird_shit = list(set([
    u'L\xe8se-majest\xe9',
    u'learn more',
    u"1788 Doctors' Riot",
    u'EAFCT',
    u'Qatl',
    u'TWOC',
]))

crimes = sorted([link.text for link in links
                 if len(link.text) > 0 and link.text not in weird_shit])

with open('crimes.txt', 'w') as f:
    f.write('\n'.join(crime for crime in crimes))
            url='https://shop.tcgplayer.com/productcatalog/product/getpricetable?'
                'captureFeaturedSellerData=True&pageSize=100&productId={0}'.format(product['productId']),
            headers={
                'User-Agent': 'Mozilla/5.0',
                'Authorization': "Bearer {0}".format(token)
            }).text
    except Exception:
        continue

    # Creates a BeautifulSoup object with the retrieved HTML, then does find_all to get a result set
    listings = BeautifulSoup(
        response, 'html.parser',
        parse_only=SoupStrainer("script", attrs={'type': 'text/javascript'})).find_all("script")

    if listings:
        product_listings = []
        listings.pop(0)
        for listing in listings:
            try:
                result = listing.contents[0].split('\r\n')
                this_listing = {}
                # The string manipulation of these items assumes a standard format where the
                # desired item appears after a colon and is formatted as "<desired item>".
                # html unescape takes care of escape sequences; however, since the content is
                # in string format it leaves behind the leading \\, so this also assumes that
                # no strings will purposefully have a \\ in them, and removes all instances
                # of \\ from strings.
                for item in result:
                    if item.find('"set_name":') > 0:
def main(OutputFileName="DITCourseList.csv",
         FileDelimiter=";",
         GetCoursesFromURL='http://www.dit.ie/catalogue/Programmes/Search',
         BaseURL='http://www.dit.ie',
         WebPageLoadDelay=10):
    # Create the file to store the output in (w)rite mode and add the header, using the
    # FileDelimiter specified in the function parameters
    MyCSVFile = open(OutputFileName, "wb")
    CourseList = csv.writer(MyCSVFile, delimiter=FileDelimiter)

    # This strainer is used to only import the table in the search page
    TableStrainer = SoupStrainer("table")
    # This strainer is used to only import the div containing the programme/module details on the individual pages
    ProgModDetailsStrainer = SoupStrainer("div", id="progmod_detail")
    ProgContentStrainer = SoupStrainer("div", class_="progmod_content")

    URLToParse = GetCoursesFromURL
    # Create a list for the programme tabs
    ProgTabs = []
    ProgTabsContent = ""
    ModuleText = ''

    # Open the webpage using requests
    WebContent = requests.get(URLToParse, timeout=WebPageLoadDelay)
    # Parse the content using soup but only parse the table tags
    DITTable = BeautifulSoup(WebContent.text, "html.parser", parse_only=TableStrainer)
    #print DITTable.prettify(formatter="html")

    CourseList.writerow(['Dept', 'link', 'CourseName', 'CourseAward', 'CourseCode',
                         'CourseLevel', 'CourseDelivery', 'Duration', 'CourseNFQLevel'])

    # Get the rows in the table
    rows = DITTable.find_all('tr')
    for row in rows:
        data = row.find_all("td")
        # Var = data[index].get_text() returns the Unicode text of the cell, i.e. the contents wrapped in a unicode string
        CourseTitle = str(data[0].get_text())
        CourseLink = BaseURL + str(data[0].find('a').get('href'))
        CourseCode = data[1].get_text()
        CourseLevel = data[2].get_text()
        CourseAward = data[3].get_text()
        # Replace "Level" with a blank string, then strip the extra whitespace, leaving just the NQAI number value
        CourseNQAI = replace(str(data[4].get_text()), "Level", '').strip()
        CourseMode = data[5].get_text()
        CourseLength = data[6].get_text()
        CourseSchool = data[7].get_text()
        #print("Writing to file ", CourseSchool, CourseLink, CourseTitle, CourseAward, CourseCode, CourseLevel, CourseMode, CourseLength, CourseNQAI)
        CourseList.writerow([CourseSchool, CourseLink, CourseTitle, CourseAward, CourseCode,
                             CourseLevel, CourseMode, CourseLength, CourseNQAI])
        # Flush the buffer to disk so the csv file is always up to date even if a later page hasn't been parsed yet
        MyCSVFile.flush()

        FileNameToWrite = CourseCode + ".html"
        # If the file doesn't exist already in the current directory then build it
        if not os.path.isfile(FileNameToWrite):
            # Get the text data for the programme
            with requests.Session() as WebSession:
                ProgContent = WebSession.get(CourseLink, timeout=WebPageLoadDelay)
                # Parse the contents of the programme page but strain it so only the relevant details are left
                ProgSoup = BeautifulSoup(ProgContent.text, "html.parser", parse_only=ProgModDetailsStrainer)
                #print(ProgSoup.prettify(formatter="html"))
                print("Processing ", CourseLink, " now...")

                # Open the file where the text will be saved
                MyHTMLFile = codecs.open(FileNameToWrite, "w", encoding='utf-8')
                HeaderText = "<h1>Text for Course " + CourseCode + " " + CourseTitle + " </h1>"
                MyHTMLFile.write(HeaderText)
                MyHTMLFile.write(CourseLink)

                # If the tab list is empty
                if not ProgTabs:
                    # Get the programme tab urls
                    ProgTabs = get_navi_tabs(ProgSoup)

                # Get the separate tabs for this programme
                print(ProgTabs)
                for Tab in ProgTabs:
                    #print(Tab)
                    TabUrl = CourseLink + str(Tab)
                    response = WebSession.get(TabUrl)
                    print("TabURL----", response, " for ", TabUrl)
                    print(response)
                    #ProgContentTabs = urllib2.urlopen(TabUrl)
                    print("Processing ", Tab, " for course", CourseTitle)
                    ProgContent = BeautifulSoup(response.text, "html.parser", parse_only=ProgContentStrainer)

                    # Create a header based off the tab value and write it to the file
                    HeaderText = str(Tab).replace("?tab=", '').strip()
                    print("Adding ", HeaderText, "to the file for ", CourseTitle)
                    HeaderText = "<h2>" + HeaderText + "</h2>"
                    MyHTMLFile.write(HeaderText)

                    # If the tab is the Programme Structure tab
                    if "Programme Structure" in TabUrl:
                        print("Getting the module contents for ", CourseTitle, "on ", TabUrl)
                        #ModuleText = ParseModulePages(ProgContent, TabUrl, ProgModDetailsStrainer, BaseURL)
                        #ProgTabsContent = "<div id=" + "moduleContent" + " >" + ModuleText + "</div>"
                        # Get the module urls and parse them
                        for Modulelink in ProgContent.findAll('a'):
                            FullLink = str(BaseURL + Modulelink.get('href'))
                            print("Processing the module url by calling a function..", FullLink)
                            ModuleText = ModuleText + ParseModulePages(
                                FullLink, ProgModDetailsStrainer, ProgContentStrainer,
                                WebPageLoadDelay, BaseURL='http://www.dit.ie')
                        # Now outside the loop write the module text to the file
                        MyHTMLFile.write(ModuleText)
                    else:
                        #print(ProgContent.prettify(formatter="html"))
                        ProgTabsContent = ProgContent.prettify(formatter="html")
                        # Write the contents to the tab after wrapping it in a div
                        ProgTabsContent = "<div id=" + str(CourseCode) + " >" + ProgTabsContent + "</div>"
                        MyHTMLFile.write(ProgTabsContent)

                MyHTMLFile.close()
                # Clear the module text and ProgTabsContent before the next iteration of the loop
                ModuleText = ''
                ProgTabsContent = ''
        else:
            # The file by that name already exists (used to overcome request timeouts after about
            # 250 files were downloaded; lets me build up the documents in batches)
            print(FileNameToWrite, " already exists so not processing it again")

    # Close the csv file
    print('File', MyCSVFile.name, ' closed')
    MyCSVFile.close()
    #MyHTMLFile.close()
    # Exit successfully
    sys.exit(0)
def _scrape_xratescom_exchange_rates(url: str) -> Dict[Asset, Price]:
    """
    Scrapes x-rates.com website for the exchange rates tables

    May raise:
    - RemoteError if we can't query x-rates.com
    """
    log.debug(f'Querying x-rates.com stats: {url}')
    prices = {}
    try:
        response = requests.get(url=url, timeout=DEFAULT_TIMEOUT_TUPLE)
    except requests.exceptions.RequestException as e:
        raise RemoteError(f'x-rates.com request {url} failed due to {str(e)}') from e

    if response.status_code != 200:
        raise RemoteError(
            f'x-rates.com request {url} failed with code: {response.status_code}'
            f' and response: {response.text}',
        )

    soup = BeautifulSoup(
        response.text,
        'html.parser',
        parse_only=SoupStrainer('table', {'class': 'tablesorter ratesTable'}),
    )
    if soup is None:
        raise RemoteError('Could not find <table> while parsing x-rates stats page')

    try:
        tr = soup.table.tbody.tr
    except AttributeError as e:
        raise RemoteError('Could not find first <tr> while parsing x-rates.com page') from e

    while tr is not None:
        secondtd = tr.select('td:nth-of-type(2)')[0]
        try:
            href = secondtd.a['href']
        except (AttributeError, KeyError) as e:
            raise RemoteError('Could not find a href of 2nd td while parsing x-rates.com page') from e  # noqa: E501

        parts = href.split('to=')
        if len(parts) != 2:
            raise RemoteError(f'Could not find to= in {href} while parsing x-rates.com page')

        try:
            to_asset = Asset(parts[1])
            if not to_asset.is_fiat():
                raise ValueError
        except (UnknownAsset, ValueError):
            log.debug(f'Skipping {parts[1]} asset because its not a known fiat asset while parsing x-rates.com page')  # noqa: E501
            tr = tr.find_next_sibling()
            continue

        try:
            price = deserialize_price(secondtd.a.text)
        except DeserializationError as e:
            log.debug(f'Could not parse x-rates.com rate of {to_asset.identifier} due to {str(e)}. Skipping ...')  # noqa: E501
            tr = tr.find_next_sibling()
            continue

        prices[to_asset] = price
        tr = tr.find_next_sibling()

    return prices
    worker_statement,
    [worker_values[0], worker_values[1], worker_values[2], worker_values[3]])

# Begin parsing Hits
# Find dates to update
cur.execute(
    """SELECT DISTINCT date FROM hitdb WHERE status NOT IN ('Paid','Rejected') AND workerID = %s ORDER BY date;""",
    [worker_ID])
pending_hits_list = cur.fetchall()
pending_date_list = []
pending_link_list = []

pending_status = br.open('https://www.mturk.com/mturk/status')
status_soup = pending_status.read()
status_soup = BeautifulSoup(status_soup, parse_only=SoupStrainer('a'))


def gather_status_links():
    for pending_date in pending_hits_list:
        pending_date = str(pending_date[0])
        hitattr = pending_date.split("-")
        dateswap = hitattr[1] + hitattr[2] + hitattr[0]
        pending_date_list.append(dateswap)
    for pending_link in status_soup:
        if pending_link.has_attr('href'):
            if ("statusdetail?encodedDate" in pending_link['href']
                    and pending_link['href'].split('=')[-1] > max(pending_date_list)):
                pending_link_list.append(pending_link['href'])
if search_type not in ('index', 'url', 'fix', 'all'):
    print('Search type must be index, url, fix, or all')
    sys.exit(1)
if search_type in ('index', 'url') and len(sys.argv) < 3:
    print('Input url')
    sys.exit(1)

# -- globals
domain_base = 'https://tvtropes.org'
uri_base = '/pmwiki/pmwiki.php/'
atoz = re.compile('Tropes(.|No)(To.)*$')
strainer = SoupStrainer('div', {'id': 'main-article'})
wanted_groups = (
    "Animation", "Anime", "AudioPlay", "ComicBook", "ComicStrip", "Disney",
    "Film", "Franchise", "LetsPlay", "LightNovel", "Literature", "Machinima",
    "Manga", "Manhua", "Manhwa", "Music", "Podcast", "Radio", "Series",
    "Theatre", "VideoGame", "VisualNovel", "WebAnimation", "Webcomic",
    "WebOriginal", "WebVideo", "WesternAnimation"
)
sleep_delay = 0.5
# --

lower_wanted_groups = tuple([g.lower() for g in wanted_groups])

cp1252 = {
    # from http://www.microsoft.com/typography/unicode/1252.htm
    u"\x80": u"\u20AC",  # EURO SIGN
import requests
from bs4 import BeautifulSoup, SoupStrainer
import alfred
import sys
# from common import waitForPeriodInQuery

# get query from user
# query = waitForPeriodInQuery('Search AIAA Aerospace Research Central', 'aiaa.png')
query = sys.argv[1]

# grab search data
params = {'searchText': query, 'pageSize': 10}
r = requests.get('http://arc.aiaa.org/action/doSearch', params=params)

only_table = SoupStrainer('table', 'articleEntry')
articles = BeautifulSoup(r.text, 'html.parser', parse_only=only_table)
# soup = BeautifulSoup(r.text)
# articles = soup.find_all('table', {'class': 'articleEntry'})

results = []
for art in articles:
    # get title
    title = art.find('div', {'class': 'art_title'}).contents[0]

    # get authors
    authorblock = art.find_all('a', {'class': 'entryAuthor'})
    authorString = ''
def get_coin_des(self):
    if self.page_type == 'coin_des':
        self.filter = SoupStrainer("div", class_="artBox")
        self.renew_soup(self.filter)
        self.coin_des = self.soup.text
threads = list()
for i in range(len(obs_list)):
    x = threading.Thread(target=crawlThread, args=(i,))
    threads.append(x)
    x.start()
for i in range(8):
    x = threading.Thread(target=chartThread, args=(i,))
    threads.append(x)
    x.start()
for t in threads:
    t.join()

only_tables = SoupStrainer("table")
for i in range(len(obs_list)):
    soup = BeautifulSoup(source[i], "lxml", parse_only=only_tables)
    #table_div = soup.find(id="content_weather")
    tables = soup.find_all("table")
    wt_table = tables[1]
    trs = wt_table.find_all('tr')
    currTr = trs[row]
    tds = currTr.find_all('td')
    if i < len(obs_list) - 1:
        tmp = tds[col].text
        if tmp == "\xa0":
            tmp = "0"
        res.append(tmp)
    else:
print('Enter genres of wallpaper you like')
while True:
    genere = input()
    generes[genere] = [1, 1]
    print('add more(yes/no)')
    choice = input()
    if choice != 'yes':
        break

file = open('temp.pickle', 'wb')
pickle.dump(generes, file)
file.close()

file = open('genere_count_data.pickle', 'wb')
print('counting the number of wallpapers in each genre')
for genere in generes:
    url = 'https://wall.alphacoders.com/search.php?search=' + genere
    only_h1_tags = SoupStrainer('h1')
    source_code = urllib.request.urlopen(url)
    source_code = source_code_shortner(source_code, 700, 800)
    soup = BeautifulSoup(source_code, 'html.parser', parse_only=only_h1_tags)
    re_str = str(soup.contents[0])
    result = re.search(' [0-9]* ', re_str)
    genere_wallpic_count[genere] = int(result.group())
    print(str(result.group()) + ' wallpapers found in ' + genere)
pickle.dump(genere_wallpic_count, file)
file.close()

print('Do you want to download initial wallpapers(It might take time)(yes/no)')
input = input()
if input == 'yes':
    # Download 10 wallpapers
    print('downloading initial wallpapers')
    for _ in range(10):
def __init__(self, *args, **kwargs):
    super(_BeautifulSoupHtml5LibFrameParser, self).__init__(*args, **kwargs)
    from bs4 import SoupStrainer

    self._strainer = SoupStrainer('table')
def __init__(self, page):
    only_main_content = SoupStrainer(id='main-content')
    self.dom = BeautifulSoup(page, 'html.parser', parse_only=only_main_content)
import requests
from bs4 import BeautifulSoup, SoupStrainer
import utils
import re

game = 'League'
url = 'https://leagueoflegends.fandom.com/wiki/Special:AllPages'
baseurl = 'https://leagueoflegends.fandom.com'

#%% Table of sections of all page list
# Get links to sections of the all-pages list to comb through for page links

# request page with target data
page = requests.get(url)

# filter the HTML content for the sections of page lists
allpagesStrain = SoupStrainer(class_="allpageslist")
allpagesSoup = BeautifulSoup(page.content, 'html.parser', parse_only=allpagesStrain)

allpagesList = []
# add page directories to a list
for link in allpagesSoup.find_all('a'):
    linkString = link.get('href')
    allpagesList.append(linkString)

# remove duplicate directories while preserving order
allpagesList = mylist = list(dict.fromkeys(allpagesList))

#%% Comb through the sections of the all page list to collect the pages
pageList = []
utils.printProgressBar(0, len(allpagesList), "parsing {} of {}".format(0, len(allpagesList)))
for idx, link in enumerate(allpagesList):
    utils.printProgressBar(idx, len(allpagesList), "parsing {} of {}".format(idx + 1, len(allpagesList)))
    # request page