import re
import time
from datetime import datetime

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver import ActionChains

# MongoHelper, HttpHelper and NLPHelper are project helpers whose import path
# is not shown in this file; a minimal assumed-interface sketch is provided at
# the end of the module.


def fetch_theverge():
    collection = MongoHelper("172.16.40.140", 27017, "ZDBThevergeCom", "pages")
    url = "https://www.theverge.com/camera-review"
    doc = []
    html = HttpHelper.fetch(url)
    soup = BeautifulSoup(html[1], "lxml")
    total = 1
    div = soup.find_all("div", attrs={"class": "c-compact-river"})
    for i in div:
        try:
            a = i.find_all(
                "a", attrs={"class": "c-entry-box--compact__image-wrapper"})
            for j in a:
                filename = HttpHelper.fetchAndSave(j['href'], "utf-8",
                                                   "D:/pages/theverge.com")
                doc.append({
                    "filename": filename,
                    "url": j['href'],
                    "state": "fetched",
                    "domain": "www.theverge.com"
                })
                print(total)
                total += 1
        except Exception as err:
            print(err)
    collection.insertMany(doc)


def fetch_techradar():
    collection = MongoHelper("172.16.40.140", 27017, "ZDBTechradarCom",
                             "pages")
    entrance = "https://www.techradar.com/reviews/car-tech?"
    doclist = []
    total = 1
    html = HttpHelper.fetch(entrance)
    soup = BeautifulSoup(html[1], "lxml")
    div = soup.find("div", attrs={"class": "listingResults"})
    divitem = div.find_all(
        "div", attrs={"class": re.compile("^listingResult small result*")})
    for i in divitem:
        try:
            a = i.find("a")
            url = a['href']
            filename = HttpHelper.fetchAndSave(url, "utf-8",
                                               "D:/pages/techradar.com")
            doclist.append({
                "filename": filename,
                "url": url,
                "state": "fetched",
                "domain": "www.techradar.com"
            })
            print(total)
            total += 1
        except Exception as err:
            print(err)
    collection.insertMany(doclist)


def fetch_gearbest():
    collection = MongoHelper("172.16.40.140", 27017, "ZDBGearbestCom", "pages")
    entrance = "https://www.gearbest.com/health-care-c_11689/"
    pageNum = 1
    total = 1
    doc = []
    while pageNum < 11 and entrance is not None:
        html = HttpHelper.fetch(entrance)[1]
        soup = BeautifulSoup(html, "lxml")
        proUl = soup.find_all("ul",
                              attrs={"class": "clearfix js_seachResultList"})
        for proList in proUl:
            li = proList.find_all("li")
            for i in li:
                try:
                    photo = i.find_all(
                        "a",
                        attrs={
                            "class":
                            "icon-loading gbGoodsItem_thumb js-selectItemd"
                        })
                    for j in photo:
                        url = j['href']
                        filename = HttpHelper.fetchAndSave(
                            url, "utf-8", "D:/pages/gearbest.com")
                        doc.append({
                            "filename": filename,
                            "url": url,
                            "state": "fetched"
                        })
                        print(total)
                        total += 1
                except Exception as err:
                    print(err)
        # find_all() returns a list, so check for an empty result rather than
        # None when looking for the "next page" link.
        a = soup.find_all("a", attrs={"class": "pageNext"})
        if not a:
            entrance = None
            print("NO." + str(pageNum))
            pageNum += 1
            continue
        next = a[-1]
        entrance = next['href']
        print("NO." + str(pageNum))
        pageNum += 1
    if doc:
        collection.insertMany(doc)


def check_product():
    url = "https://mistinhaler.com/"
    while url is not None:
        title = ""
        statuscode, html = HttpHelper.fetch(url)
        soup = BeautifulSoup(html, "lxml")
        li_all = soup.find_all(
            "li",
            attrs={
                "class":
                re.compile(r"^post-\d* product type-product status-publish")
            })
        for li in li_all:
            # print(li['class'][0])
            title = li.find("h2", attrs={
                "class": "woocommerce-loop-product__title"
            }).text
            driver = webdriver.Chrome(
                'C:/Program Files (x86)/Google/Chrome/Application/chromedriver.exe'
            )
            driver.get("https://www.amazon.com/")
            driver.find_element_by_xpath(
                '//*[@id="twotabsearchtextbox"]').send_keys(title)
            submit = driver.find_element_by_xpath(
                "//*[@id=\"nav-search\"]/form/div[2]/div/input")
            ActionChains(driver).move_to_element(submit).click(
                submit).perform()
            html = driver.page_source.encode("utf-8")
            driver.close()
            soup2 = BeautifulSoup(html, "lxml")
            # If Amazon's search came back empty (h1#noResultsTitle present),
            # print the product's class slug.
            a = soup2.find_all("h1", attrs={"id": "noResultsTitle"})
            if a:
                print(li['class'][0])
        next_page = soup.find_all("a", attrs={"class": "next page-numbers"})
        if not next_page:
            url = None
            continue
        for i in next_page:
            url = i['href']


def fetch_banggood():
    collection = MongoHelper("172.16.40.140", 27017, "ZDBBgoodCom", "pages")
    entrance = "https://www.banggood.com/Wholesale-Indoor-Lighting-c-2514.html"
    pageNum = 1
    total = 1
    doc = []
    while pageNum < 11 and entrance is not None:
        html = HttpHelper.fetch(entrance)[1]
        soup = BeautifulSoup(html, 'lxml')
        proUl = soup.find("ul", attrs={"class": "goodlist_1"})
        li = proUl.find_all("li")
        for i in li:
            try:
                photo = i.find_all("span", attrs={"class": "img"})
                for j in photo:
                    a = j.find("a")
                    url = a['href']
                    filename = HttpHelper.fetchAndSave(
                        url, 'utf-8', 'D:/pages/banggood.com')
                    doc.append({
                        "filename": filename,
                        "url": url,
                        "state": "fetched"
                    })
                    print(total)
                    total += 1
            except Exception as err:
                print(err)
        div = soup.find("div", attrs={"class": "page_num"})
        next = div.find("a", attrs={"id": "listNextPage"})
        if next is None:
            entrance = None
            print("NO." + str(pageNum))
            pageNum += 1
            continue
        entrance = next['href']
        print("NO." + str(pageNum))
        pageNum += 1
    if doc:
        collection.insertMany(doc)


def fetch_tomtop():
    collection = MongoHelper("172.16.40.140", 27017, "ZDBTomtopCom2", "pages")
    entrance = "https://www.tomtop.com/vehicle-infotainment-11035/"
    pageNum = 1
    total = 1
    doc = []
    while pageNum < 11 and entrance is not None:
        html = HttpHelper.fetch(entrance)[1]
        soup = BeautifulSoup(html, 'lxml')
        proUl = soup.find("ul", attrs={"class": "lbBox categoryProductList"})
        li = proUl.find_all("li")
        for i in li:
            try:
                photo = i.find_all("div", attrs={"class": "productImg"})
                for j in photo:
                    a = j.find("a")
                    url = "https://www.tomtop.com" + a['href']
                    filename = HttpHelper.fetchAndSave(
                        url, 'utf-8', 'D:/pages/tomtop.com')
                    doc.append({
                        "filename": filename,
                        "url": url,
                        "state": "fetched"
                    })
                    print(total)
                    total += 1
            except Exception as err:
                print(err)
        ul = soup.find("ul", attrs={"class": "lbBox pagingWarp"})
        next = ul.find("li", attrs={"class": "lineBlock pageN pageClick"})
        if next is None:
            entrance = None
            print("NO." + str(pageNum))
            pageNum += 1
            continue
        entrance = "https://www.tomtop.com" + next.find("a")['href']
        print("NO." + str(pageNum))
        pageNum += 1
    if doc:
        collection.insertMany(doc)


def testSoup():
    url = "https://www.drugs.com/comments/abobotulinumtoxina/"
    html = HttpHelper.fetch(url)
    # fetch() returns (statusCode, html); parse the HTML part.
    soup = BeautifulSoup(html[1], "lxml")
    # Remove all tables inside the review divs.
    tableList = soup.select("div.user-comment table")
    if len(tableList) > 0:
        for table in tableList:
            table.decompose()
    # Get the outer HTML of each review div.
    divList = soup.select("div.user-comment")
    if len(divList) > 0:
        reviewDivList = []
        for div in divList:
            divHtml = str(div)
            divText = div.text
            print(divHtml)
            reviewDivList.append(divHtml)


def amazonfetch_detail():
    doclist = []
    total = 1
    collection = MongoHelper("172.16.40.140", 27017, "ZDBTestCom",
                             "bloodglucosemeter")
    while True:
        slist = collection.nextPage(100)
        if slist is None or len(slist) == 0:
            break
        for i in slist:
            doclist.append(i)
    for x in doclist:
        if x['state'] != "fetched":
            continue
        try:
            status, html = HttpHelper.fetch(x['url'])
            soup = BeautifulSoup(html, "lxml")
            title = soup.find_all("span", attrs={"id": "productTitle"})
            for i in title:
                text = i.text
                title = text.strip()
            # bylineInfo is the brand link.
            a = soup.find_all("a", attrs={"id": "bylineInfo"})
            for i in a:
                href = i['href']
                if re.match("^/{1}.*", href):
                    href = "http://www.amazon.com" + href
            description = soup.find_all(
                "ul",
                attrs={"class": "a-unordered-list a-vertical a-spacing-none"})
            for i in description:
                doc = {
                    "_id": x['_id'],
                    "brand": x['brand'],
                    "url": x['url'],
                    "state": "pass",
                    "price": x['price'],
                    "title": title,
                    "brand_a": href,
                    "inner_des": str(i)
                }
                collection.updateOne(doc)
                doc.clear()
        except Exception as err:
            print(err)
            continue
        print(total)
        total += 1


def fetch_dx():
    collection = MongoHelper("172.16.40.140", 27017, "ZDBDxCom", "pages")
    entrance = "http://www.dx.com/c/computer-office-399/networking-314"
    pageNum = 1
    total = 1
    doc = []
    while pageNum < 11 and entrance is not None:
        html = HttpHelper.fetch(entrance)[1]
        soup = BeautifulSoup(html, "lxml")
        proUl = soup.find_all("ul", attrs={"class": "productList subList"})
        for proList in proUl:
            li = proList.find_all("li", attrs={"class": "c_cates"})
            for i in li:
                try:
                    photo = i.find("div", attrs={"class": "photo"})
                    url = "https://www.dx.com" + photo.find("a")['href']
                    filename = HttpHelper.fetchAndSave(url, "utf-8",
                                                       "D:/pages/dx.com")
                    doc.append({
                        "filename": filename,
                        "url": url,
                        "state": "fetched"
                    })
                    print(total)
                    total += 1
                except Exception as err:
                    print(err)
        # find_all() returns a list, so check for an empty result rather than None.
        a = soup.find_all("a", attrs={"class": "next"})
        if not a:
            entrance = None
            print("NO." + str(pageNum))
            pageNum += 1
            continue
        next = a[-1]
        entrance = "https://www.dx.com" + next['href']
        print(entrance)
        print(pageNum)
        pageNum += 1
    if doc:
        collection.insertMany(doc)


def updateAllArticle(MONGO_HOST, MONGO_DATABASE_NAME, IMPORT_URL):
    try:
        articleCollection = MongoHelper(MONGO_HOST, 27017, MONGO_DATABASE_NAME,
                                        'article')
        total = 0
        while True:
            articleList = articleCollection.nextPage(10)
            if len(articleList) == 0:
                break
            newArticleList = []
            for article in articleList:
                total += 1
                print("total=" + str(total))
                url = article['url']
                retry = 0
                while True:
                    retry += 1
                    if retry > 2:
                        break
                    statusCode, html = HttpHelper.fetch(url)
                    if html is not None and len(html) > 0:
                        article['status'] = 0
                        # Check title, TODO
                        print("update article ok, retry=" + str(retry) +
                              ", url=" + url)
                        break
                    else:
                        article['status'] = -1
                        print("update article error, retry=" + str(retry) +
                              ", url=" + url)
                        time.sleep(1)
                article['updateTime'] = datetime.now()
                articleCollection.updateOne(article)
    except Exception as err:
        print(err)
    finally:
        print("exit")


def fetch_fashionbeans():
    collection = MongoHelper("172.16.40.140", 27017, "ZDBFashionbeansCom",
                             "pages")
    entrance = "http://www.fashionbeans.com/category/mens-hairstyles/"
    doclist = []
    total = 1
    num = 1
    while total < 200 and entrance is not None:
        html = HttpHelper.fetch(entrance)
        soup = BeautifulSoup(html[1], "lxml")
        div = soup.find("div", attrs={"id": "catmainBody"})
        articles = div.find_all("div", attrs={"class": "catArticles"})
        for i in articles:
            try:
                a = i.find("a", attrs={"class": "left relative"})
                url = a['href']
                filename = HttpHelper.fetchAndSave(
                    url, "utf-8", "D:/pages/fashionbeans.com")
                doclist.append({
                    "filename": filename,
                    "url": url,
                    "state": "fetched",
                    "domain": "www.fashionbeans.com"
                })
                print(total)
                total += 1
            except Exception as err:
                print(err)
        a = soup.find("a", attrs={"class": "nextLink right"})
        print("Page: " + str(num))
        if a is None:
            entrance = None
            continue
        num += 1
        entrance = a['href']
    if doclist:
        collection.insertMany(doclist)


def fetch_cnet():
    collection = MongoHelper("172.16.40.140", 27017, "ZDBCnetCom", "pages")
    entrance = "https://www.cnet.com/topics/tablets/products/"
    doclist = []
    total = 1
    while total < 80 and entrance is not None:
        html = HttpHelper.fetch(entrance)
        soup = BeautifulSoup(html[1], "lxml")
        section1 = soup.find("section", attrs={"id": "dfllResults"})
        section2 = section1.find_all(
            "section", attrs={"class": "col-3 searchItem product "})
        for i in section2:
            try:
                a = i.find("a", attrs={"class": "imageWrap"})
                url = "https://www.cnet.com" + a['href']
                filename = HttpHelper.fetchAndSave(url, "utf-8",
                                                   "D:/pages/cnet.com")
                doclist.append({
                    "filename": filename,
                    "url": url,
                    "state": "fetched",
                    "domain": "www.cnet.com"
                })
                print(total)
                total += 1
            except Exception as err:
                print(err)
        # find_all() returns a list, so check for an empty result rather than None.
        a = soup.find_all("a", attrs={"class": "next"})
        if not a:
            entrance = None
            continue
        next = a[-1]
        entrance = "https://www.cnet.com" + next['href']
    if doclist:
        collection.insertMany(doclist)


def fetch_digitaltrends():
    collection = MongoHelper("172.16.40.140", 27017, "ZDBDigitaltrendsCom",
                             "pages")
    entrance = "https://www.digitaltrends.com/tv-reviews/"
    doclist = []
    total = 1
    while total < 120 and entrance is not None:
        html = HttpHelper.fetch(entrance)
        soup = BeautifulSoup(html[1], "lxml")
        div = soup.find("div", attrs={"class": "m-products"})
        item = div.find_all("div", attrs={"class": "item"})
        for i in item:
            try:
                h3 = i.find("h3", attrs={"class": "title"})
                url = h3.find("a")['href']
                filename = HttpHelper.fetchAndSave(
                    url, "utf-8", "D:/pages/digitaltrends.com")
                doclist.append({
                    "filename": filename,
                    "url": url,
                    "state": "fetched",
                    "domain": "www.digitaltrends.com"
                })
                print(total)
                total += 1
            except Exception as err:
                print(err)
        # find_all() returns a list, so check for an empty result rather than None.
        a = soup.find_all("a", attrs={"class": "next page-numbers"})
        if not a:
            entrance = None
            continue
        next = a[-1]
        entrance = next['href']
    if doclist:
        collection.insertMany(doclist)


def test():
    collection = MongoHelper("172.16.40.140", 27017, "ZDBTestCom", "drugs",
                             "url")
    doclist = []
    print("dbcom")
    html = HttpHelper.fetch("https://www.drugs.com/alpha/a5.html")
    soup = BeautifulSoup(html[1], "lxml")
    # print(soup)
    ul_list = soup.find_all('ul', attrs={"class": "doc-type-list"})
    # ul_list = soup.find_all('ul', attrs={"class": re.compile('doc-type*')})
    # print(ul_list)
    for i in ul_list:
        li = i.find_all('a')
        for j in li:
            a = j.text
            # print(a)
            b = j['href']
            # print(b)
            print('\n')
            # The anchor text is the title; the href is the url.
            doclist.append({"url": b, "title": a})
    print(doclist)
    collection.insertMany(doclist)


def amazonfetch():
    total = 1
    goods = 1
    url = "https://www.amazon.com/s/ref=sr_as_oo?rh=i%3Aaps%2Ck%3Ablood+pressure+monitor&keywords=blood+pressure+mon" \
          "itor&ie=UTF8&qid=1527130301"
    collection = MongoHelper("172.16.40.140", 27017, "ZDBTestCom",
                             "bloodglucosemeter")
    '''excel = xlwt.Workbook()
    sheet = excel.add_sheet("Blood glucose meter")
    content = "brand"
    sheet.write(0,0,content)
    content = "url"
    sheet.write(0,1,content)
    row = 1'''
    doc = []
    while url is not None:
        statuscode, html = HttpHelper.fetch(url)
        soup = BeautifulSoup(html, "lxml")
        for s in soup('script'):
            s.extract()
        # print(soup.prettify())
        # return
        li_all = soup.find_all("li", attrs={"id": re.compile(r"^result_\d{1,2}")})
        # print(li_all[3])
        # return
        for li in li_all:
            print("Checking item No. " + str(goods))
            goods += 1
            flag = li.find_all("p", attrs={"class": "acs-mn2-midwidgetHeader"})
            if flag != []:
                print(flag)
                continue
            # print("flagok")
            a = li.find_all("a", attrs={"class": re.compile("^a-link-normal s-access-detail-page.*")})
            if a == []:
                continue
            # print("aok")
            for i in a:
                url2 = i['href']
            branddiv = li.find_all("div", attrs={"class": "a-row a-spacing-none"})
            if branddiv == []:
                continue
            # print("brandok")
            brand = ""
            for i in branddiv:
                span = i.find_all("span", attrs={"class": "a-size-small a-color-secondary"})
                if span == []:
                    continue
                # print("spanok")
                for j in span:
                    brand += j.text
            brand = brand[3:]
            p = li.find_all("span", attrs={"class": "sx-price-whole"})
            if p == []:
                continue
            for i in p:
                price = i.text
            if price == []:
                continue
            # print("priceok")
            div = li.find_all("div", attrs={"class": "a-row a-spacing-mini"})
            if div == []:
                continue
            # print("divok")
            for j in div:
                comment_all = j.find_all("a", attrs={"class": "a-size-small a-link-normal a-text-normal"})
                if comment_all == []:
                    continue
                # print("comok")
                for i in comment_all:
                    comment = i.text
            print("Type of price:")
            print(type(price))
            print(type(comment))
            price = price.replace(",", "")
            comment = comment.replace(",", "")
            print(price)
            print(comment)
            try:
                if isinstance(price, str):
                    price1 = int(price)
                if isinstance(comment, str):
                    comment1 = int(comment)
            except Exception as err:
                print(err)
                continue
            if price1 > 20 and price1 < 50 and comment1 > 100:
                print(brand)
                print("No." + str(total))
                total += 1
                url3 = url2
                if re.match("^/{1}.*", url2):
                    url3 = "https://www.amazon.com" + url2
                '''sheet.write(row,0,brand)
                sheet.write(row,1,url3)
                row += 1'''
                doc.append({'brand': brand, 'url': url3, 'state': 'fetched',
                            'price': price + ".99"})
                if total > 90:
                    print("completed")
                    # excel.save("D:/电商/test.xls")
                    collection.insertMany(doc)
                    return
        next_page = soup.find_all("a", attrs={"id": "pagnNextLink"})
        if next_page == []:
            url = None
            continue
        for i in next_page:
            if re.match("^/{1}.*", i['href']):
                url = "https://www.amazon.com" + i['href']
            else:
                url = i['href']
    print("not enough 90")
    # excel.save("D:/电商/test.xls")
    collection.insertMany(doc)


def test2():
    '''
    First, pull every document (url) out of the collection into doclist.
    :return:
    '''
    collection = MongoHelper("172.16.40.140", 27017, "ZDBMedlineplusOrg",
                             "supplement_copy", "url")
    nlp = NLPHelper()
    doclist = []
    doc = []  # doc holds the value of the new "attrlist" key
    while True:
        slist = collection.nextPage(100)
        if slist is None or len(slist) == 0:
            break
        for i in slist:
            doclist.append(i)
    # print(doclist)
    total = 0
    '''
    Then parse each url.
    '''
    for i in doclist:
        if i['state'] != "FETCHED":  # only process documents whose state is FETCHED
            continue
        # Source site 1
        if i['cat'] == 1:
            html = HttpHelper.fetch(i['url'])
            soup = BeautifulSoup(html[1], "lxml")  # fetch() returns [statusCode, html]; take element 1
            slist = soup.find_all("section")
            content = ""  # initialize content as an empty string
            for j in slist:  # j is a <section>
                hlist = j.find_all("h2")  # every h2 is a subtitle
                for x in hlist:  # x is a title
                    title = x.text
                tlist = j.find_all("div", attrs={"class": "section-body"})
                for y in tlist:  # y is the body that belongs to the subtitle above
                    doc.append({
                        "subtitle": title,
                        "innerhtml": str(y),
                        "text": y.text
                    })
                    content += y.text  # the page content is the concatenation of all subtitle bodies
            description = nlp.getSummary(content, wordCount=20)  # build the description
            '''
            Build the doc that will overwrite the current collection entry.
            '''
            doc2 = {
                "_id": i['_id'],
                "cat": i['cat'],
                "fileName": i['fileName'],
                "url": i['url'],
                "host": i['host'],
                "state": "completed",
                "title": i['title'],
                "content": content,
                "description": description,
                "attrlist": doc
            }
            collection.updateOne(doc2)
            doc.clear()
            doc2.clear()  # clear both docs after each update
            total += 1
            print(total)  # print how many documents are done so far
        # Source site 2; essentially the same approach
        elif i['cat'] == 2:
            html = HttpHelper.fetch(i['url'])
            soup = BeautifulSoup(html[1], "lxml")
            slist = soup.find_all(
                "div", attrs={"class": re.compile('field field-name-body*')})
            content = ""
            for j in slist:
                hlist = j.find_all("h2")
                titlearr = []  # subtitles of the current page, paired one-to-one with the innerhtml entries
                for x in hlist:  # x is a title
                    title = x.text
                    titlearr.append(title)
                tlist = j.find_all("ul")
                index = 0
                for y in tlist:
                    if index > len(titlearr) - 1:  # guard against index out of range
                        break
                    doc.append({
                        "subtitle": str(titlearr[index]),
                        "innerhtml": str(y),
                        "text": y.text
                    })
                    content += y.text
                    index = index + 1
                titlearr.clear()
            # print(content)
            description = nlp.getSummary(content, wordCount=20)
            doc2 = {
                "_id": i['_id'],
                "cat": i['cat'],
                "fileName": i['fileName'],
                "url": i['url'],
                "host": i['host'],
                "state": "completed",
                "title": i['title'],
                "content": content,
                "description": description,
                "attrlist": doc
            }
            collection.updateOne(doc2)
            doc.clear()
            doc2.clear()
            # print(j)
            total += 1
            print(total)
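

# ---------------------------------------------------------------------------
# The functions above depend on three project helpers that are not defined in
# this file: HttpHelper, MongoHelper and NLPHelper. The classes below are a
# minimal, assumed-interface sketch reconstructed from how they are called
# here (fetch/fetchAndSave, insertMany/nextPage/updateOne, getSummary). They
# are stand-ins for reading and local testing only, not the project's actual
# implementations.
# ---------------------------------------------------------------------------
import os

import requests
from pymongo import MongoClient


class HttpHelper:
    """Assumed interface: simple HTTP fetch helpers used by the crawlers."""

    @staticmethod
    def fetch(url):
        # Callers unpack the result as (statusCode, html).
        resp = requests.get(url, timeout=30)
        return resp.status_code, resp.text

    @staticmethod
    def fetchAndSave(url, encoding, folder):
        # Fetch a page, save it under `folder`, and return the file name
        # (the callers only store the returned name in MongoDB).
        status, html = HttpHelper.fetch(url)
        os.makedirs(folder, exist_ok=True)
        filename = str(abs(hash(url))) + ".html"
        with open(os.path.join(folder, filename), "w", encoding=encoding) as f:
            f.write(html)
        return filename


class MongoHelper:
    """Assumed interface: thin wrapper around one MongoDB collection."""

    def __init__(self, host, port, dbName, collectionName, key="_id"):
        self._collection = MongoClient(host, port)[dbName][collectionName]
        self._key = key
        self._page = 0

    def insertMany(self, docs):
        self._collection.insert_many(docs)

    def nextPage(self, pageSize):
        # Return the next batch of documents; an empty list ends the
        # `while True: ... nextPage(...)` loops used above.
        batch = list(self._collection.find()
                     .skip(self._page * pageSize).limit(pageSize))
        self._page += 1
        return batch

    def updateOne(self, doc):
        # Replace (or insert) the stored document with the same _id.
        self._collection.replace_one({"_id": doc["_id"]}, doc, upsert=True)


class NLPHelper:
    """Assumed interface: only getSummary() is used, in test2()."""

    def getSummary(self, content, wordCount=20):
        # Naive stand-in: the real helper presumably builds a proper summary;
        # this simply keeps the first `wordCount` words.
        return " ".join(content.split()[:wordCount])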