def download_data(url):
    response = uopen(url)
    page_html = response.read()
    response.close()
    soup = BeautifulSoup(page_html, 'html.parser')
    containers = soup.findAll('div', {'class': 'a-section a-spacing-medium'})
    for i, container in enumerate(containers, start=1):
        print(i, '--------')
        try:
            print(container.find('span', {'class': 'a-size-medium a-color-base a-text-normal'}).text)
        except AttributeError:
            pass
        try:
            print("Rs.", container.find('span', {'class': 'a-price-whole'}).text)
        except AttributeError:
            pass
        try:
            print('Rating:', container.find('span', {'class': 'a-icon-alt'}).text)
        except AttributeError:
            pass

def getScore(self, your_team):
    request = uopen("https://www.cricbuzz.com")
    html_code = request.read()
    request.close()
    soup_page = soup(html_code, "html.parser")
    matches = soup_page.findAll("div", {"class": "cb-col cb-col-25 cb-mtch-blk"})
    c = c1 = 0
    for match in matches:
        if your_team in match.a["title"]:
            team = match.a["title"]
            str1 = match.div.text
            c1 = 1
            for ch in str1:
                if ch.isdigit():
                    msg = match.div.text
                    c = 1
                    break
            if c == 0:
                msg = "Match not Started Yet!!!"
                break
        if c1 == 1:
            break
    if c1 == 0:
        team = your_team
        msg = "Match Not Found!!!"
    app = Qt.QApplication(sys.argv)
    systemtray = Qt.QSystemTrayIcon(app)
    systemtray.show()
    systemtray.showMessage(team, msg)

def getrangking(isbn):
    """
    Purpose: pull and return the current ranking using a regular expression.
    Steps: from the ISBN, build the final URL for talking to the Amazon
    server, then call urllib.request.urlopen to open that address.
    :param isbn: the ISBN code
    :return: the ranking
    """
    url = '%s%s' % (AMZN, isbn)
    req = Request(
        url,
        headers={
            'Connection': 'Keep-Alive',
            'Accept': 'text/html, application/xhtml+xml, */*',
            'Accept-Language': 'en-US,en;q=0.8,zh-Hans-CN;q=0.5,zh-Hans;q=0.3',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko'
        })
    page = uopen(req)
    data = page.read()
    page.close()
    return REGEX.findall(data.decode())[0]

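# AMZN and REGEX are referenced throughout these snippets but never defined
# in them. A minimal sketch of what they might be, judging from how the
# getRanking variants use them (the URL and the pattern are assumptions):
import re
from urllib.request import Request, urlopen as uopen

AMZN = 'http://amazon.com/dp/'             # assumed product-page base URL
REGEX = re.compile(r'#([\d,]+) in Books')  # assumed sales-rank pattern
# Note: the variants that call REGEX.findall() on raw bytes would need a
# bytes pattern instead, e.g. re.compile(rb'#([\d,]+) in Books').
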
def create_overview_table_row(content_file):
    global pattern_list
    pattern_path = str(content_file.path)
    pattern_name = pattern_path.replace("src/patterns/", "")
    raw_yaml = pattern_dir + pattern_name
    try:
        contents = uopen(raw_yaml).read()
        yaml_content = yaml.round_trip_load(contents, preserve_quotes=True)
    except Exception:
        print(raw_yaml + " could not be loaded!")
        return {"pattern": pattern_name, "done": "ERROR", "issue": None, "contr": 0}
    pname = get_pattern_name(pattern_name)
    gh_paths[pname] = pattern_path
    pattern_list[pname] = yaml_content
    if "contributors" in yaml_content.keys():
        contributors = yaml_content["contributors"]
    else:
        contributors = []
    pattern_curators[pname] = list(contributors)
    orcid = get_orcid()
    if orcid in contributors:
        processed = "Yes"
    else:
        processed = "No"
    issue = get_issue(pattern_name)
    return {"pattern": pattern_name, "done": processed, "issue": issue, "contr": len(list(contributors))}

def getRanking(isbn):
    setdefaulttimeout(10)  # set the socket timeout (assumes: from socket import setdefaulttimeout)
    # print(f"{AMZN}{isbn}")
    page = uopen(f"{AMZN}{isbn}")
    data = page.read()
    page.close()
    return str(REGEX.findall(data)[0], "utf-8")  # convert the bytes match to a Unicode string

def getrangking(isbn):
    url = '%s%s' % (AMZN, isbn)
    req = Request(url, headers={
        'Connection': 'Keep-Alive',
        'Accept': 'text/html, application/xhtml+xml, */*',
        'Accept-Language': 'en-US,en;q=0.8,zh-Hans-CN;q=0.5,zh-Hans;q=0.3',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko'
    })
    with uopen(req) as page:
        return REGEX.findall(page.read().decode())[0]

def getRanking(isbn):
    try:
        page = uopen("%s%s" % (AMIZ, isbn))  # or str.format
        data = page.read()
        return REGEX.findall(data)[0]
    except Exception:
        return None
    finally:
        if 'page' in locals():  # the original `page in locals()` never matched
            page.close()

def get_ranking(isbn):
    url = '{0}{1}'.format(amazon, isbn)
    page = uopen(url)
    data = page.read()
    page.close()
    # print(type(data))
    # decode the bytes into a str
    data1 = data.decode('utf-8')
    result = regex.search(data1)
    # print(result)
    return result.group(1)

def download(num):
    data = get_json(num)
    # Save JSON
    if sys.version_info[0] >= 3:
        json_file = open('{0}.json'.format(num), 'w', encoding='utf-8')
        json_file.write(str(data))
    else:
        json_file = open('%d.json' % num, 'w')
        json_file.write(str(data).encode('utf-8'))
    # Create HTML from template
    update_meta(data)
    meta_labels = {
        'num': 'Number:',
        'date': 'Published:',
        'news': 'News:',
        'link': 'Link:'
    }
    # Write HTML and image to file
    if sys.version_info[0] >= 3:
        file = open('{0}.html'.format(num), 'w', encoding='utf-8')
        file.write(TEMPLATES['head'].substitute(data))
        for i in filter((lambda i: data[i] or False), meta_labels.keys()):
            file.write(TEMPLATES['entry'].substitute({
                'label': meta_labels[i],
                'value': cgi.escape(str(data[i]), quote=True)
            }))
        file.write(TEMPLATES['tail'].substitute(data))
        file.close()
    else:
        file = open('%d.html' % num, 'w')
        file.write((TEMPLATES['head'].substitute(data)).encode('utf-8'))
        for i in filter((lambda i: data[i] or False), meta_labels.keys()):
            file.write((TEMPLATES['entry'].substitute({
                'label': meta_labels[i],
                'value': cgi.escape(str(data[i]), quote=True)
            })).encode('utf-8'))
        file.write((TEMPLATES['tail'].substitute(data)).encode('utf-8'))
        file.close()
    image = uopen(data['img'])
    try:
        img = open('{0}.png'.format(num), 'wb')
    except AttributeError:
        img = open('%d.png' % num, 'wb')
    img.write(image.read())

def getFoodInfo(url):
    try:
        newurl = FHandler.mobile_url + url
        tempdata = uopen(newurl).read()
        # tempdata = open('test_foodinfo.html', 'r').read()
        bsobj = bs(tempdata, 'html.parser')
        name = bsobj.body.find('div', attrs={
            'class': 'page-title'
        }).text.strip(' \r\t\n').lower()
    except URLError:
        RESULT(" stopped, no connection?")
        FHandler.__debugDump(tempdata, '/tmp/debug_foodinfo.no_connection.html')
        exit(-1)
    except AttributeError:
        RESULT(" stopped, page is being redirected?")
        FHandler.__debugDump(tempdata, '/tmp/debug_foodinfo.redirected.html')
        exit(-1)
    food_tagline = bsobj.find('div', attrs={
        'class': 'page-info-text'
    }).text.strip(' \r\t\n').lower()
    food_table = bsobj.find('div', attrs={'class': 'nutpanel'})
    food_info = FHandler.handleFoodInfo(food_table)
    name = ' '.join(name.split())
    yem = Yemek(name, food_info[0], food_info[1], food_info[2],
                food_info[3], food_info[4], food_info[5])
    # import pdb; pdb.set_trace()
    section_titles = bsobj.body.find_all('div', {'class': 'section-title'})
    portion_data = [
        x for x in section_titles if x.text == "Common serving sizes"
    ]
    if len(portion_data) > 0:
        portion_data = portion_data[0]
        portion_info = FHandler.handlePortionData(portion_data.findNext('table'))
        if portion_info != -1:
            for key, val in portion_info:
                yem.portions.insert(key, val)
    return yem

def score():
    output_score = []
    my_url2 = uopen('https://www.matchendirect.fr/espagne')
    html_p2 = my_url2.read()
    my_url2.close()
    soup2 = BeautifulSoup(html_p2, "html.parser")
    container_score = soup2.findAll("div", {"id": "livescore"})
    for con in container_score:
        team_con = con.findAll("td", {"class": "lm3"})
        for x in team_con:
            score_c = x.findAll("span", {"class": "lm3_score"})
            score = score_c[0].text
            output_score.append(score)
    return output_score

def horaire():
    output_horaire = []
    my_url3 = uopen('https://www.matchendirect.fr/espagne')
    html_p3 = my_url3.read()
    my_url3.close()
    soup3 = BeautifulSoup(html_p3, "html.parser")
    container_statut = soup3.findAll("div", {"id": "livescore"})
    for cont in container_statut:
        team_cont = cont.findAll("td", {"class": "lm2 lm2_0"})
        for x in team_cont:
            horaire = x.next_element.text.strip("-- : --")
            output_horaire.append(horaire)
    # print(output_horaire)
    return output_horaire

def scrap():
    if request.method == "POST":
        searchString = request.form['content'].replace(" ", "")
        noc = int(request.form['numOfComments'])
        try:
            flipkart_search_url = r"https://www.flipkart.com/search?q=" + searchString
            page = uopen(flipkart_search_url)
            page_content = page.read()
            page.close()
            page_html = bs(page_content, "html.parser")
            boxes = page_html.findAll("div", {"class": "bhgxx2 col-12-12"})
            boxes = boxes[2:]
            box = boxes[0]
            product_url = r"https://www.flipkart.com" + box.div.div.div.a['href']
            product = uopen(product_url)
            product_content = product.read()
            product.close()
            product_html = bs(product_content, "html.parser")
            reviews = product_html.findAll("div", {"class": "_3nrCtb"})
            reviews_df = []
            for i in range(noc):
                Heading = reviews[i].div.div.div.p.text
                comm = reviews[i].findAll("div", {"class": ""})
                Content = comm[0].div.text
                reviews_df.append([Heading, Content])
            reviews_df = pd.DataFrame(reviews_df, columns=["Heading", "Content"])
            reviews_df = reviews_df.to_dict("records")
            print(reviews_df)
            return render_template("result.html", reviews_df=reviews_df)
        except Exception as e:
            return str(e)
    else:
        return "not post"

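# scrap() above reads Flask's request object and ends with render_template,
# so it is presumably registered as a Flask view; a minimal assumed wiring
# (the route path and run settings are hypothetical) could be:
from flask import Flask, request, render_template

app = Flask(__name__)
app.add_url_rule('/review', 'scrap', scrap, methods=['GET', 'POST'])

if __name__ == '__main__':
    app.run(debug=True)
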
def statut():
    output_statut = []
    my_url3 = uopen('https://www.matchendirect.fr/espagne')
    html_p3 = my_url3.read()
    my_url3.close()
    soup3 = BeautifulSoup(html_p3, "html.parser")
    container_statut = soup3.findAll("div", {"id": "livescore"})
    for cont in container_statut:
        team_cont = cont.findAll("td", {"class": "lm2 lm2_0"})
        for x in team_cont:
            x.span.decompose()  # decompose() returns None, so keep only the remaining text
            statut = x.text
            output_statut.append(statut)
    return output_statut

def get_json(num):
    url = get_url(num)
    # Download the JSON (note: the bare `raise` below re-raises immediately,
    # so despite the original comment there is no actual retry)
    while True:
        try:
            comic = uopen(url).read().decode()
            break
        except Exception:
            raise
    # A crutch for a comic JSON with apparently an error in it
    if num == 971:
        comic = comic.replace("\u00e2\u0080\u0099", "'")
    # Parse the JSON string
    return json.loads(comic)

def download_data(url):
    response = uopen(url)
    page_html = response.read()
    response.close()
    soup = BeautifulSoup(page_html, 'html.parser')
    containers = soup.findAll("div", {'class': "_13oc-S"})
    for i, container in enumerate(containers, start=1):
        print(i, '--------')
        try:
            print("Name:", container.find("a", {'class': 's1Q9rs'}).text)
        except AttributeError:
            pass
        try:
            print("Name:", container.find("div", {'class': '_4rR01T'}).text)
        except AttributeError:
            pass
        try:
            print("Its price is", container.find("div", {'class': '_30jeq3'}).text)
        except AttributeError:
            pass
        try:
            print("Its price is", container.find("div", {'class': '_30jeq3 _1_WHN1'}).text)
        except AttributeError:
            pass
        try:
            print("Some description:", container.find("li", {'class': 'rgWa7D'}).text)
        except AttributeError:
            pass
        try:
            print("Some description:", container.find("div", {'class': '_3Djpdu'}).text)
        except AttributeError:
            print("")

def resMatch():
    output = []
    my_url = uopen('https://www.matchendirect.fr/espagne')
    html_p = my_url.read()
    my_url.close()
    soup = BeautifulSoup(html_p, "html.parser")
    container = soup.findAll("div", {"id": "livescore"})
    for c in container:
        team_c = c.findAll("td", {"class": "lm3"})
        for x in team_c:
            team = x.a["title"]
            regex_team = re.findall(r'[^Détail du match \:].*', team)
            score_c = x.findAll("span", {"class": "lm3_score"})
            score = score_c[0].text
            output.append("".join(regex_team))
    return output

def download(num): data = get_json(num) # Save JSON if sys.version_info[0] >= 3: json_file = open("{0}.json".format(num), "w", encoding="utf-8") json_file.write(str(data)) else: json_file = open("%d.json" % num, "w") json_file.write(str(data).encode("utf-8")) # Create HTML from template update_meta(data) meta_labels = {"num": "Number:", "date": "Published:", "news": "News:", "link": "Link:"} # Write HTML and image to file if sys.version_info[0] >= 3: file = open("{0}.html".format(num), "w", encoding="utf-8") file.write(TEMPLATES["head"].substitute(data)) for i in filter((lambda i: data[i] or False), meta_labels.keys()): file.write( TEMPLATES["entry"].substitute({"label": meta_labels[i], "value": cgi.escape(str(data[i]), quote=True)}) ) file.write(TEMPLATES["tail"].substitute(data)) file.close() else: file = open("%d.html" % num, "w") file.write((TEMPLATES["head"].substitute(data)).encode("utf-8")) for i in filter((lambda i: data[i] or False), meta_labels.keys()): file.write( ( TEMPLATES["entry"].substitute( {"label": meta_labels[i], "value": cgi.escape(str(data[i]), quote=True)} ) ).encode("utf-8") ) file.write((TEMPLATES["tail"].substitute(data)).encode("utf-8")) file.close() image = uopen(data["img"]) try: img = open("{0}.png".format(num), "wb") except AttributeError: img = open("%d.png" % num, "wb") img.write(image.read())
def get_ranking(isbn):
    url = '%s%s' % (AMZN, isbn)
    # headers = {'User-Agent': user_agent}
    # req = Request(url, headers=headers)
    print('request url:', url)
    try:
        # On Python 3 a context must be passed, otherwise this raises:
        # <urlopen error [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed (_ssl.c:748)>
        page = uopen(url, timeout=30, context=context)  # or use str.format()
        print('Open', url, 'success')
        data = page.read()
        # On Python 3 the response must be decoded first, otherwise:
        # TypeError: cannot use a string pattern on a bytes-like object
        data = data.decode('utf-8')
        page.close()
        result = REGEX.findall(data)
        return result[0]
    except Exception as e:
        print(str(e))
        return 'unknown'

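# The `context` argument used in get_ranking() above is assumed to be built
# elsewhere; a minimal sketch that disables certificate verification, which
# matches the SSL workaround described in the comments above (and the ctx
# setup that appears later in this collection):
import ssl

context = ssl.create_default_context()
context.check_hostname = False
context.verify_mode = ssl.CERT_NONE
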
def __init__(self, query, foodobj=0):
    INFO("\r\tChecking online...", end=' ')
    self.query = HTMLMethods.toHTMLChars(query)
    try:
        self.pagedata = uopen(FHandler.query_url + self.query).read()
    except URLError:
        print(" stopped, no connection?")
        exit(-1)
    # # offline saved
    # print(self.pagedata)
    # exit(0)
    # self.pagedata = open("test_sub.html").read()
    self.results = self.ParseResults()
    if foodobj == 0:
        self.found = self.resHandler()
    else:
        # Check the current food object against the results list
        self.found = self.checkFoodHomology(foodobj)

def download(num):
    data = get_json(num)
    # Save JSON
    if sys.version_info[0] >= 3:
        json_file = open('{0}.json'.format(num), 'w', encoding='utf-8')
        json_file.write(str(data))
    else:
        json_file = open('%d.json' % num, 'w')
        json_file.write(str(data).encode('utf-8'))
    # Create HTML from template
    update_meta(data)
    meta_labels = {'num': 'Number:', 'date': 'Published:', 'news': 'News:', 'link': 'Link:'}
    # Write HTML and image to file
    if sys.version_info[0] >= 3:
        file = open('{0}.html'.format(num), 'w', encoding='utf-8')
        file.write(TEMPLATES['head'].substitute(data))
        for i in filter((lambda i: data[i] or False), meta_labels.keys()):
            file.write(TEMPLATES['entry'].substitute({'label': meta_labels[i],
                                                      'value': cgi.escape(str(data[i]), quote=True)}))
        file.write(TEMPLATES['tail'].substitute(data))
        file.close()
    else:
        file = open('%d.html' % num, 'w')
        file.write((TEMPLATES['head'].substitute(data)).encode('utf-8'))
        for i in filter((lambda i: data[i] or False), meta_labels.keys()):
            file.write((TEMPLATES['entry'].substitute({'label': meta_labels[i],
                                                       'value': cgi.escape(str(data[i]), quote=True)})).encode('utf-8'))
        file.write((TEMPLATES['tail'].substitute(data)).encode('utf-8'))
        file.close()
    image = uopen(data['img'])
    try:
        img = open('{0}.png'.format(num), 'wb')
    except AttributeError:
        img = open('%d.png' % num, 'wb')
    img.write(image.read())

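# The download() variants above rely on cgi.escape, which was deprecated in
# Python 3.2 and removed in 3.8. On current interpreters the equivalent is
# html.escape, for example:
from html import escape

print(escape('5 < 6 & "quoted"', quote=True))  # 5 &lt; 6 &amp; &quot;quoted&quot;
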
def scrapeSyl(link=""): uclient = uopen(link) read_html = uclient.read() uclient.close() ssoup = soup(read_html, "html.parser") table = ssoup.find("table", {"class": "sc_courselist"}) headers_soup = table.find_all("span", class_="courselistcomment areaheader") headers = [] for i in headers_soup: i = cleanScrape(i) if i != "": headers.append(i) # print(headers) slots = [] code_soup = table.find_all("a", class_="bubblelink code") codes = [] for c in code_soup: n = c.findParent(class_="codecol").findNextSibling("td") h = n.findNextSibling(class_="hourscol") c = cleanScrape(c) n = cleanScrape(n) if h != "" and h != None: h = cleanScrape(h) if h == None or h == "": h = "N/A" if c != "": c = c.replace(u"\xa0", u" ") # debug print cnh # print(f"{c}: {n} - {h}") newSlot = infoSlot(c, n, h) slots.append(newSlot) # print(slots) return headers, slots
import ssl

from urllib.parse import urlparse as upar
from urllib.request import urlopen as uopen
from bs4 import BeautifulSoup as soup

# Ignore SSL certificate errors
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

# using Amazon with a keyword of "laptop"
my_url = "https://www.amazon.com/s/ref=nb_sb_noss_1?url=search-alias%3Daps&field-keywords=laptop"
try:
    # my_url = "https://www.newegg.com/Product/ProductList.aspx?Submit=ENE&DEPA=0&Order=BESTMATCH&Description=graphic+card&ignorear=0&N=-1&isNodeId=1"
    # opening up the connection
    uClient = uopen(my_url, context=ctx)
    # getcode() lives on the response object, not on the bytes from read()
    if uClient.getcode() != 200:
        print("Error on page", uClient.getcode())
    page_html = uClient.read()
    uClient.close()
except Exception:
    print("Unable to retrieve or parse page")
    # continue

# html parsing
page_soup = soup(page_html, "html.parser")

# grabs each product: s-item-container is for Amazon,
# while for newegg it would be item-container
# containers = page_soup.findAll("li", {"class": "s-result-item celwidget"})
containers = page_soup.findAll("div", {"class": "s-item-container"})

def getRanking(isbn):
    page = uopen('%s%s' % (AMZN, isbn))
    data = page.read()
    page.close()
    return str(REGEX.findall(data)[0], 'utf-8')

from bs4 import BeautifulSoup as soup
from urllib.request import urlopen as uopen

my_url = 'https://www.footlocker.co.uk/en/men/_air-max-95?shoesize=7'
client = uopen(my_url)
page = client.read()
client.close()

page_soup = soup(page, "html.parser")
grab_crep = page_soup.findAll("div", {"class": "fl-category--productlist--item"})

print("")
for crp in grab_crep:
    print(crp.a.span.string.replace(" - Men Shoes", ""))
    print(crp.a.get('href'))
    print("")
print("")

def getRanking(isbn):
    page = uopen('%s%s' % (AMZN, isbn))  # or '{0}{1}'.format(AMZN, isbn) for 2.6+
    data = page.read()
    page.close()
    # print(data)
    return str(REGEX.findall(data)[0], 'utf-8')

from urllib.request import urlopen as uopen
from bs4 import BeautifulSoup as soup
import time

graphicurl = "https://www.newegg.com/p/pl?N=100007709%20600030348&page=1"
cpuurl = "https://www.newegg.com/Processors-Desktops/SubCategory/ID-343?Tid=7671"

# Opening the GPU site, reading its contents, closing the connection
graphic = uopen(graphicurl)
graphicread = graphic.read()
graphic.close()

# Parsing the site HTML
graphic_soup = soup(graphicread, "html.parser")

# Grabs each product
gcontainers = graphic_soup.findAll("div", {"class": "item-container"})
gcontainer = gcontainers[0]

# Names and opens the output file
Gfilename = "GPU Products.csv"
gf = open(Gfilename, "w")
headers = "Brand, Product_name, Price, Shipping_Fee\n"
gf.write(headers)

for gcontainer in gcontainers:
    gbrand = gcontainer.div.div.a.img["title"]
    gtitle = gcontainer.findAll("a", {"class": "item-title"})

from bs4 import BeautifulSoup as bs
from urllib.request import urlopen as uopen
from urllib.request import Request as rq
import csv

with open('test', 'w') as new_file:
    test_writer = csv.writer(new_file, delimiter=",")
    my_url = 'https://www.amazon.com/'
    req = rq(my_url, headers={'User-Agent': 'Mozilla/5.0'})
    uClient = uopen(req)
    page_html = uClient.read()
    uClient.close()
    page_soup = bs(page_html, "html.parser")
    containers = page_soup.findAll('div')
    print(containers)

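# test_writer above is created but never used; a self-contained sketch of
# actually writing one CSV row per matched tag (the file name and the two
# fields are assumptions, not from the source):
import csv

with open('test_rows.csv', 'w', newline='') as out_file:
    writer = csv.writer(out_file, delimiter=",")
    writer.writerow(['tag', 'text'])
    for container in containers:
        writer.writerow([container.name, container.get_text(strip=True)])
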
def get_ranking(is_bn):
    page = uopen('%s%s' % (AMZN, is_bn))
    data = page.read()
    page.close()
    return REGEX.findall(data)[0]

import bs4
from bs4 import BeautifulSoup as soup
from urllib.request import urlopen as uopen

url = "https://www.newegg.com/Video-Cards-Video-Devices/Category/ID-38?Tpk=graphics%20card"
client = uopen(url)
page_html = client.read()
client.close()

page = soup(page_html, "html.parser")
containers = page.findAll("div", {"class": "item-container"})

# Interactive-style exploration: these bare expressions only show output in a REPL
len(containers)
container = containers[0]
print(container)
container.div.div.a.img["title"]
container.a.img["title"]

# prices = page.findAll("li", {"class": "price-ship"})
# price = prices[0]
shipping = container.findAll("li", {"class": "price-ship"})
shipping[0].text
# for removing initial and final \r\n
shipping[0].text.strip()

# forming the loop
for container in containers:
    brand = container.div.div.a.img["title"]
    brand_info = container.a.img["title"]
    shipping = container.findAll("li", {"class": "price-ship"})

def getRanking(isbn):
    page = uopen('%s%s' % (AMZN, isbn))  # or '{0}{1}'.format(AMZN, isbn) for 2.6+
    data = page.read()
    page.close()
    return str(REGEX.findall(data)[0], 'utf-8')

def getRanking(isbn):
    # data must be None (or bytes), not {}; {} would turn this into a broken POST
    req = request.Request('%s%s' % (AMZN, isbn), None, head)
    with uopen(req) as page:
        return str(REGEX.findall(page.read().decode('utf-8'))[0])

def getRanking(isbn):
    page = uopen('{0}{1}'.format(AMZN, isbn))
    data = page.read()
    page.close()
    return REGEX.findall(data)[0]

from bs4 import BeautifulSoup as soup
from urllib.request import urlopen as uopen

my_url = 'https://www.medicalnewstoday.com/articles/321533.php'  # data taken from

# Opening the URL and grabbing the content
Page = uopen(my_url)
page_html = Page.read()
Page.close()

# HTML parsing
page_soup = soup(page_html, "html.parser")  # parses the HTML data
body = page_soup.find("div", {"class": "article_body"})  # finds the div tag with class="article_body"

# Opens a file and writes the article text to it
f = open("Could gut bacteria cause joint pain?.txt", "w")
f.write(body.text)
f.close()  # close the file

def getRanking(isbn):
    with uopen('{0}{1}'.format(AMZN, isbn)) as page:
        return str(REGEX.findall(page.read())[0], 'utf-8')

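# A minimal assumed driver for the getRanking variants above; the ISBNs are
# illustrative placeholders, not values from the source:
if __name__ == '__main__':
    for isbn in ('0132269937', '0132356139'):
        print('Ranking(%s) = %r' % (isbn, getRanking(isbn)))
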