def fetch_theverge():
    """Crawl the theverge.com camera-review listing, save each linked review
    page to disk, and record the saved pages in MongoDB."""
    collection = MongoHelper("172.16.40.140", 27017, "ZDBThevergeCom", "pages")
    url = "https://www.theverge.com/camera-review"
    doc = []
    html = HttpHelper.fetch(url)  # returns [statusCode, html]
    soup = BeautifulSoup(html[1], "lxml")
    total = 1
    div = soup.find_all("div", attrs={"class": "c-compact-river"})
    for i in div:
        try:
            a = i.find_all(
                "a", attrs={"class": "c-entry-box--compact__image-wrapper"})
            for j in a:
                filename = HttpHelper.fetchAndSave(j['href'], "utf-8",
                                                   "D:/pages/theverge.com")
                doc.append({
                    "filename": filename,
                    "url": j['href'],
                    "state": "fetched",
                    "domain": "www.theverge.com"
                })
                print(total)
                total += 1
        except Exception as err:
            print(err)
    # Skip the insert when nothing was collected, consistent with the other
    # fetch_* helpers (the original called insertMany unconditionally).
    if doc:
        collection.insertMany(doc)
def for_blank_des():
    """Scan 'built' supplement documents and generate a summary description
    for any whose description is empty.

    Currently a dry run: the generated description is printed, and the
    collection update is left disabled below.
    """
    collection = MongoHelper("172.16.40.140", 27017, "ZDBMedlineplusOrg",
                             "supplement_copy", "url")
    doclist = []
    doc = []
    # Instantiate the helper (test2() below does the same); calling getSummary
    # on the bare class would pass the content string as `self`.
    nlp = NLPHelper()
    total = 0
    while True:
        slist = collection.nextPage(100)
        if slist is None or len(slist) == 0:
            break
        for i in slist:
            doclist.append(i)
    for i in doclist:
        if i['state'] == "built":
            if i['description'] != "":
                continue
            description = nlp.getSummary(i['content'], wordCount=35)
            print(i['title'])
            print(description)
            '''doc.append(
                {"_id": i['_id'], "cat": i['cat'], "fileName": i['fileName'],
                 "url": i['url'], "host": i['host'], "state": "built",
                 "title": i['title'], "content": i['content'],
                 "description": description, "attrlist": i['attrlist'],
                 "contenthtml": i['contenthtml']})
            collection.updateOne(doc)'''
            total += 1
            doc.clear()
            print(total)
def Mongo2Csv():
    """Create a post for every product in state 'pass' and mark it 'posted'.

    Fixes over the original: updateOne now receives a single document dict
    (the original passed a one-element list, unlike every other caller in
    this module), and the premature `return` after the first page — which
    limited processing to the first batch of 10 — has been removed.
    """
    try:
        pdCollection = MongoHelper("172.16.40.140", 27017, "ZDBTestCom",
                                   "bloodglucosemeter")
        total = 0
        while True:
            pdList = pdCollection.nextPage(10)
            if pdList is None or len(pdList) == 0:
                break
            for pd in pdList:
                if pd['state'] != 'pass':
                    continue
                newID = createPost(pd)
                if newID is not None:
                    doc = {"_id": pd['_id'], "ID": newID,
                           "brand": pd['brand'], "url": pd['url'],
                           "state": "posted", "price": pd['price'],
                           "title": pd['title'], "brand_a": pd['brand_a'],
                           "inner_des": pd['inner_des']}
                    print(doc['ID'])
                    pdCollection.updateOne(doc)
                    print("create post ok")
                else:
                    print("create post error")
                total += 1
                print('total=' + str(total) + ', title=' + pd['title'])
        print('Create all posts ok')
    except Exception as err:
        print(err)
def fetch_techradar():
    """Crawl the techradar.com car-tech review listing, save each review page
    to disk, and record it in MongoDB."""
    collection = MongoHelper("172.16.40.140", 27017, "ZDBTechradarCom", "pages")
    entrance = "https://www.techradar.com/reviews/car-tech?"
    doclist = []
    total = 1
    html = HttpHelper.fetch(entrance)  # [statusCode, html]
    soup = BeautifulSoup(html[1], "lxml")
    div = soup.find("div", attrs={"class": "listingResults"})
    divitem = div.find_all(
        "div", attrs={"class": re.compile("^listingResult small result*")})
    for i in divitem:
        try:
            a = i.find("a")
            url = a['href']
            filename = HttpHelper.fetchAndSave(url, "utf-8",
                                               "D:/pages/techradar.com")
            doclist.append({
                "filename": filename,
                "url": url,
                "state": "fetched",
                "domain": "www.techradar.com"
            })
            print(total)
            total += 1
        except Exception as err:
            print(err)
    # Skip the insert when nothing was collected, consistent with the other
    # fetch_* helpers (the original called insertMany unconditionally).
    if doclist:
        collection.insertMany(doclist)
def fetch_highsnobiety():
    """Open the highsnobiety.com style listing in Chrome (pause at the
    breakpoint to let the page load), then save each linked article and
    record it in MongoDB."""
    collection = MongoHelper("172.16.40.140", 27017, "ZDBHighsnobietyCom",
                             "pages")
    entrance = "https://www.highsnobiety.com/style/"
    driver = webdriver.Chrome(
        'C:/Program Files (x86)/Google/Chrome/Application/chromedriver.exe')
    try:
        driver.get(entrance)
        print("waiting for u")
        doclist = []  # set a breakpoint here (original note in Chinese)
        total = 1
        html = driver.page_source.encode('utf-8')
    finally:
        # The original never shut Chrome down, leaking a browser process.
        driver.quit()
    soup = BeautifulSoup(html, "lxml")
    div = soup.find("div", attrs={"class": "sub-contents__item"})
    articles = div.find_all("article")
    for i in articles:
        try:
            a = i.find("a")
            url = a['href']
            filename = HttpHelper.fetchAndSave(url, "utf-8",
                                               "D:/pages/highsnobiety.com")
            doclist.append({
                "filename": filename,
                "url": url,
                "state": "fetched",
                "domain": "www.highsnobiety.com"
            })
            print(total)
            total += 1
        except Exception as err:
            print(err)
    collection.insertMany(doclist)
def fetch_fashionista():
    """Open the fashionista.com style listing in Chrome (pause at the
    breakpoint to let the page load), then save each linked article and
    record it in MongoDB."""
    collection = MongoHelper("172.16.40.140", 27017, "ZDBFashionistaCom",
                             "pages")
    entrance = "https://fashionista.com/style"
    driver = webdriver.Chrome(
        'C:/Program Files (x86)/Google/Chrome/Application/chromedriver.exe')
    try:
        driver.get(entrance)
        print("waiting for u")
        doclist = []  # set a breakpoint here (original note in Chinese)
        total = 1
        html = driver.page_source.encode('utf-8')
    finally:
        # The original never shut Chrome down, leaking a browser process.
        driver.quit()
    soup = BeautifulSoup(html, "lxml")
    articles = soup.find_all(
        "article",
        attrs={"class": "m-card mm-card--landscape-image mm-card--type-list"})
    for i in articles:
        try:
            a = i.find(
                "a", attrs={"class": "m-card--image-link m-background-image"})
            url = "https://fashionista.com" + a['href']
            filename = HttpHelper.fetchAndSave(url, "utf-8",
                                               "D:/pages/fashionista.com")
            doclist.append({
                "filename": filename,
                "url": url,
                "state": "fetched",
                "domain": "www.fashionista.com"
            })
            print(total)
            total += 1
        except Exception as err:
            print(err)
    collection.insertMany(doclist)
def test():
    """Send every 'fetched' page's saved HTML to the crawler queue and mark
    the document 'sended'."""
    collection = MongoHelper(MONGO_HOST, 27017, MONGO_DATABASE_NAME, "pages")
    doclist = []
    total = 1
    while True:
        slist = collection.nextPage(100)
        if slist is None or len(slist) == 0:
            break
        for i in slist:
            doclist.append(i)
    for page in doclist:
        try:
            if page['state'] != 'fetched':
                continue
            prefix = page['filename'][0:1]
            filepath = DOMAIN + prefix + "/" + page['filename']
            # The with-statement closes the file; the original also called
            # fp.close() redundantly afterwards.
            with open(filepath, encoding="utf-8") as fp:
                html = fp.read()
            task = {
                "id": "id",
                "url": page['url'],
                'topic': 'crawler_data_p123',
                'routingKey': '256'
            }  # dx.com 225, banggood.com 224, tomtop 195, gearbest 256
            sendPage(task, html)
            page['state'] = 'sended'
            collection.updateOne(page)
            print(total)
            total += 1
        except Exception as err:
            print(err)
def test2():
    """Reset every page document in ZDBGearbestCom back to state 'fetched'."""
    collection = MongoHelper("172.16.40.140", 27017, "ZDBGearbestCom", "pages")
    while True:
        slist = collection.nextPage(10)
        # Also guard against None — every other nextPage loop in this module
        # checks for it; the original would crash on len(None).
        if slist is None or len(slist) == 0:
            break
        for article in slist:
            article['state'] = "fetched"
            collection.updateOne(article)
def parseAllBlog():
    """Re-parse the saved HTML of every blog in state 'PARSED' and attach the
    extracted fields to the document; abort the current page on the first
    parse failure."""
    try:
        collection = MongoHelper(MONGO_HOST, 27017, MONGO_DATABASE_NAME, 'blog')
        processed = 0
        while True:
            blogList = collection.nextPage(100)
            if len(blogList) == 0:
                break
            for blog in blogList:
                if blog['state'] == 'PARSED':
                    filePath = HttpHelper.getFullPath(HTML_ROOT_PATH,
                                                      blog['fileName'])
                    with open(filePath, 'r', encoding='utf-8') as file:
                        html = file.read()
                    (found, title, desc, ogTitle, ogDesc, twTitle, twDesc,
                     keywords, content, author, summary,
                     summaryKeywords) = parseBlog(html)
                    hasAnyTitle = (title is not None or ogTitle is not None
                                   or twTitle is not None)
                    if found and hasAnyTitle:
                        blog['doc'] = {
                            'title': title,
                            'ogTitle': ogTitle,
                            'twTitle': twTitle,
                            'desc': desc,
                            'ogDesc': ogDesc,
                            'twDesc': twDesc,
                            'keywords': keywords,
                            'content': content,
                            'author': author,
                            'summary': summary,
                            'summaryKeywords': summaryKeywords,
                        }
                        blog['state'] = 'PARSED'
                        collection.updateOne(blog)
                        print("ok")
                    else:
                        print("error")
                        break  # stop this page on the first failed parse
                processed += 1
                print("url=" + blog['url'])
        print("total=" + str(processed))
    except Exception as err:
        print(err)
    finally:
        print("exit")
def fetch_gearbest():
    """Crawl up to 10 listing pages of the gearbest health-care category,
    saving every product page and recording it in MongoDB."""
    collection = MongoHelper("172.16.40.140", 27017, "ZDBGearbestCom", "pages")
    entrance = "https://www.gearbest.com/health-care-c_11689/"
    pageNum = 1
    total = 1
    doc = []
    while pageNum < 11 and entrance is not None:
        html = HttpHelper.fetch(entrance)[1]
        soup = BeautifulSoup(html, "lxml")
        proUl = soup.find_all("ul",
                              attrs={"class": "clearfix js_seachResultList"})
        for proList in proUl:
            for i in proList.find_all("li"):
                try:
                    photo = i.find_all(
                        "a",
                        attrs={"class":
                               "icon-loading gbGoodsItem_thumb js-selectItemd"})
                    for j in photo:
                        url = j['href']
                        filename = HttpHelper.fetchAndSave(
                            url, "utf-8", "D:/pages/gearbest.com")
                        doc.append({
                            "filename": filename,
                            "url": url,
                            "state": "fetched"
                        })
                        print(total)
                        total += 1
                except Exception as err:
                    print(err)
        a = soup.find_all("a", attrs={"class": "pageNext"})
        # find_all returns a (possibly empty) list, never None; the original
        # `if a is None` never fired, so a[-1] raised IndexError on the last
        # page instead of terminating the loop.
        if not a:
            entrance = None
            print("NO." + str(pageNum))
            pageNum += 1
            continue
        entrance = a[-1]['href']
        print("NO." + str(pageNum))
        pageNum += 1
    if doc:
        collection.insertMany(doc)
def Resolve():
    """Parse every 'fetched' page's saved HTML with parseBlog and update the
    document with the extracted article fields, state 'pass'.

    The original line ended with a dangling, unterminated `'''for i in doc:`
    fragment (dead commented-out code); it has been removed.
    """
    collection = MongoHelper(MONGO_HOST, 27017, MONGO_DATABASE_NAME, "pages")
    doclist = []
    while True:
        slist = collection.nextPage(100)
        if slist is None or len(slist) == 0:
            break
        for i in slist:
            doclist.append(i)
    for article in doclist:
        try:
            if article['state'] != "fetched":
                continue
            prefix = article['filename'][0:1]
            filepath = DOMAIN + prefix + "/" + article['filename']
            # The with-statement closes the file; the original also called
            # fp.close() redundantly afterwards.
            with open(filepath, encoding="utf-8") as fp:
                html = fp.read()
            p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12 = parseBlog(html)
            md5 = CryptHelper.getMD5Hash(article['url'])
            key = UrlHelper.getHostPath(article['url'])[1]
            # p3 = desc, p11 = summary in parseBlog's return order (see
            # parseAllBlog's unpacking of the same tuple).
            excerpt = p3 if p3 else p11
            doc = {
                "_id": article['_id'],
                "filename": article['filename'],
                "url": article['url'],
                "state": "pass",
                "domain": article['domain'],
                'md5': md5,
                'title': p2,
                'excerpt': excerpt,
                'content': p9,
                'author': article['domain'],
                'categories': CATEGORY,
                'tags': "",
                'status': 0,
                'key': key
            }
            collection.updateOne(doc)
            doc.clear()
        except Exception as err:
            print(err)
def importAllArticle(MONGO_HOST, MONGO_DATABASE_NAME, IMPORT_URL):
    """Batch-post all articles in state 'pass' to IMPORT_URL, ten per HTTP
    request."""
    try:
        articleCollection = MongoHelper(MONGO_HOST, 27017,
                                        MONGO_DATABASE_NAME, 'pages')
        total = 0
        while True:
            articleList = articleCollection.nextPage(10)
            if len(articleList) == 0:
                break
            total += len(articleList)
            print("total=" + str(total))
            newArticleList = []
            for article in articleList:
                if article['state'] != "pass":
                    continue
                newArticleList.append({
                    'id': article['md5'],
                    'title': article['title'],
                    'excerpt': article['excerpt'],
                    'content': "",
                    'author': article['domain'],
                    'domain': article['domain'],
                    'categories': article['categories'],
                    'tags': article['tags'],
                    'url': article['url'],
                    'status': article['status'],
                    'key': article['key'],
                })
            errorCode, rsp = HttpHelper.post(IMPORT_URL, newArticleList)
            if (errorCode == "OK" and rsp is not None and 'isOk' in rsp
                    and rsp['isOk'] == True):
                print("import article ok")
            else:
                print("import article error")
            newArticleList.clear()
            # NOTE(review): only the last article of each batch is marked
            # 'sended' here (loop leftover variable) — presumably every posted
            # article should be; confirm before relying on the state field.
            article['state'] = "sended"
            articleCollection.updateOne(article)
    except Exception as err:
        print(err)
    finally:
        print("exit")
def test3():
    """Assemble `contenthtml` from each 'completed' document's attrlist and
    advance the document to state 'built'."""
    collection = MongoHelper("172.16.40.140", 27017, "ZDBMedlineplusOrg",
                             "supplement_copy", "url")
    doclist = []
    total = 0
    while True:
        slist = collection.nextPage(100)
        if slist is None or len(slist) == 0:
            break
        for i in slist:
            doclist.append(i)
    for i in doclist:
        if i['state'] != "completed":  # only touch data prepared by test2()
            continue
        contenthtml = ""
        for j in i['attrlist']:
            # Fixed: the original emitted '<div class="div-content>' — the
            # missing closing quote produced malformed HTML in every document.
            contenthtml += ('<h3 class="h3-subtitle">' + j['subtitle']
                            + '</h3><br/>'
                            + '<div class="div-content">' + j['innerhtml']
                            + '</div><br/>')
        doc = {
            "_id": i['_id'],
            "cat": i['cat'],
            "fileName": i['fileName'],
            "url": i['url'],
            "host": i['host'],
            "state": "built",
            "title": i['title'],
            "content": i['content'],
            "description": i['description'],
            "attrlist": i['attrlist'],
            "contenthtml": contenthtml
        }
        collection.updateOne(doc)
        doc.clear()
        total += 1
        print(total)
def fetch_banggood():
    """Crawl up to 10 pages of the banggood indoor-lighting category, saving
    each product page and recording it in MongoDB."""
    collection = MongoHelper("172.16.40.140", 27017, "ZDBBgoodCom", "pages")
    entrance = "https://www.banggood.com/Wholesale-Indoor-Lighting-c-2514.html"
    pageNum = 1
    total = 1
    doc = []
    while pageNum < 11 and entrance is not None:
        markup = HttpHelper.fetch(entrance)[1]
        soup = BeautifulSoup(markup, 'lxml')
        listing = soup.find("ul", attrs={"class": "goodlist_1"})
        for item in listing.find_all("li"):
            try:
                for span in item.find_all("span", attrs={"class": "img"}):
                    url = span.find("a")['href']
                    filename = HttpHelper.fetchAndSave(
                        url, 'utf-8', 'D:/pages/banggood.com')
                    doc.append({
                        "filename": filename,
                        "url": url,
                        "state": "fetched"
                    })
                    print(total)
                    total += 1
            except Exception as err:
                print(err)
        pager = soup.find("div", attrs={"class": "page_num"})
        next_link = pager.find("a", attrs={"id": "listNextPage"})
        # No next-page anchor means we reached the last listing page.
        entrance = None if next_link is None else next_link['href']
        print("NO." + str(pageNum))
        pageNum += 1
    if doc != []:
        collection.insertMany(doc)
def fetch_tomtop():
    """Crawl up to 10 pages of the tomtop vehicle-infotainment category,
    saving each product page and recording it in MongoDB."""
    collection = MongoHelper("172.16.40.140", 27017, "ZDBTomtopCom2", "pages")
    entrance = "https://www.tomtop.com/vehicle-infotainment-11035/"
    pageNum = 1
    total = 1
    doc = []
    while pageNum < 11 and entrance is not None:
        markup = HttpHelper.fetch(entrance)[1]
        soup = BeautifulSoup(markup, 'lxml')
        listing = soup.find("ul", attrs={"class": "lbBox categoryProductList"})
        for item in listing.find_all("li"):
            try:
                for box in item.find_all("div", attrs={"class": "productImg"}):
                    url = "https://www.tomtop.com" + box.find("a")['href']
                    filename = HttpHelper.fetchAndSave(
                        url, 'utf-8', 'D:/pages/tomtop.com')
                    doc.append({
                        "filename": filename,
                        "url": url,
                        "state": "fetched"
                    })
                    print(total)
                    total += 1
            except Exception as err:
                print(err)
        pager = soup.find("ul", attrs={"class": "lbBox pagingWarp"})
        next_item = pager.find("li",
                               attrs={"class": "lineBlock pageN pageClick"})
        if next_item is None:
            entrance = None  # last page reached
        else:
            entrance = "https://www.tomtop.com" + next_item.find("a")['href']
        print("NO." + str(pageNum))
        pageNum += 1
    if doc != []:
        collection.insertMany(doc)
def fetch_whowhatwear():
    """Open the whowhatwear trends channel in Chrome (pause at the breakpoint
    to let the page load), then save each linked article and record it in
    MongoDB."""
    collection = MongoHelper("172.16.40.140", 27017, "ZDBWhowhatwearCom",
                             "pages")
    entrance = "https://www.whowhatwear.com/channel/trends"
    driver = webdriver.Chrome(
        'C:/Program Files (x86)/Google/Chrome/Application/chromedriver.exe')
    try:
        driver.get(entrance)
        print("waiting for u")
        doclist = []  # set a breakpoint here (original note in Chinese)
        total = 1
        html = driver.page_source.encode('utf-8')
    finally:
        # The original never shut Chrome down, leaking a browser process.
        driver.quit()
    soup = BeautifulSoup(html, "lxml")
    div = soup.find(
        "div",
        attrs={
            "class": "card__group card__group--river card__group--river-channel"
        })
    articles = div.find_all(
        "div",
        attrs={
            "class": "card__item card__item--river card__item--river-channel"
        })
    for i in articles:
        try:
            a = i.find("a")
            url = "https://www.whowhatwear.com" + a['href']
            filename = HttpHelper.fetchAndSave(url, "utf-8",
                                               "D:/pages/whowhatwear.com")
            doclist.append({
                "filename": filename,
                "url": url,
                "state": "fetched",
                "domain": "www.whowhatwear.com"
            })
            print(total)
            total += 1
        except Exception as err:
            print(err)
    collection.insertMany(doclist)
def fetchAllBlog():
    """Download the HTML of every blog in state 'CLOSED'; a successful fetch
    promotes the blog to 'FETCHED', a failure leaves it 'CLOSED'."""
    try:
        catCollection = MongoHelper(MONGO_HOST, 27017, MONGO_DATABASE_NAME,
                                    'category')
        collection = MongoHelper(MONGO_HOST, 27017, MONGO_DATABASE_NAME,
                                 'blog')
        processed = 0
        while True:
            blogList = collection.nextPage(100)
            if len(blogList) == 0:
                break
            for blog in blogList:
                if blog['state'] == 'CLOSED':
                    savedName = HttpHelper.fetchAndSave(blog['url'], "utf-8",
                                                        HTML_ROOT_PATH)
                    if savedName is not None and len(savedName) > 0:
                        blog['fileName'] = savedName
                        blog['state'] = "FETCHED"
                    else:
                        blog['state'] = "CLOSED"
                    collection.updateOne(blog)
                processed += 1
                print("url=" + blog['url'])
        print("total=" + str(processed))
    except Exception as err:
        print(err)
    finally:
        print("exit")
def amazonfetch_detail():
    """Fetch each 'fetched' product's Amazon detail page, extract the title,
    brand link and description bullets, and update the document to 'pass'.

    Fix: `title` and `href` are now reset for every product. In the original
    they kept their values from the previous iteration, so a page missing its
    title or byline stored stale data (or a raw ResultSet) into MongoDB.
    """
    doclist = []
    total = 1
    collection = MongoHelper("172.16.40.140", 27017, "ZDBTestCom",
                             "bloodglucosemeter")
    while True:
        slist = collection.nextPage(100)
        if slist is None or len(slist) == 0:
            break
        for i in slist:
            doclist.append(i)
    for x in doclist:
        if x['state'] != "fetched":
            continue
        try:
            status, html = HttpHelper.fetch(x['url'])
            soup = BeautifulSoup(html, "lxml")
            title = ""
            href = ""
            for t in soup.find_all("span", attrs={"id": "productTitle"}):
                title = t.text.strip()
            # bylineInfo carries the brand link.
            for a in soup.find_all("a", attrs={"id": "bylineInfo"}):
                href = a['href']
                if re.match("^/{1}.*", href):  # relative link → absolute
                    href = "http://www.amazon.com" + href
            for d in soup.find_all(
                    "ul",
                    attrs={"class":
                           "a-unordered-list a-vertical a-spacing-none"}):
                doc = {"_id": x['_id'], "brand": x['brand'], "url": x['url'],
                       "state": "pass", "price": x['price'], "title": title,
                       "brand_a": href, "inner_des": str(d)}
                collection.updateOne(doc)
        except Exception as err:
            print(err)
            continue
        print(total)
        total += 1
def fetch_dx():
    """Crawl up to 10 pages of the dx.com networking category, saving each
    product page and recording it in MongoDB."""
    collection = MongoHelper("172.16.40.140", 27017, "ZDBDxCom", "pages")
    entrance = "http://www.dx.com/c/computer-office-399/networking-314"
    pageNum = 1
    total = 1
    doc = []
    while pageNum < 11 and entrance is not None:
        html = HttpHelper.fetch(entrance)[1]
        soup = BeautifulSoup(html, "lxml")
        proUl = soup.find_all("ul", attrs={"class": "productList subList"})
        for proList in proUl:
            for i in proList.find_all("li", attrs={"class": "c_cates"}):
                try:
                    photo = i.find("div", attrs={"class": "photo"})
                    url = "https://www.dx.com" + photo.find("a")['href']
                    filename = HttpHelper.fetchAndSave(url, "utf-8",
                                                       "D:/pages/dx.com")
                    doc.append({
                        "filename": filename,
                        "url": url,
                        "state": "fetched"
                    })
                    print(total)
                    total += 1
                except Exception as err:
                    print(err)
        a = soup.find_all("a", attrs={"class": "next"})
        # find_all returns a (possibly empty) list, never None; the original
        # `if a is None` never fired, so a[-1] raised IndexError on the last
        # page instead of terminating the loop.
        if not a:
            entrance = None
            print("NO." + str(pageNum))
            pageNum += 1
            continue
        entrance = "https://www.dx.com" + a[-1]['href']
        print(entrance)
        print(pageNum)
        pageNum += 1
    if doc:
        collection.insertMany(doc)
def updateAllArticle(MONGO_HOST, MONGO_DATABASE_NAME, IMPORT_URL):
    """Re-fetch every article's URL (up to two attempts each) and record its
    reachability in `status` (0 = ok, -1 = unreachable) plus an update
    timestamp."""
    try:
        articleCollection = MongoHelper(MONGO_HOST, 27017,
                                        MONGO_DATABASE_NAME, 'article')
        total = 0
        while True:
            articleList = articleCollection.nextPage(10)
            if len(articleList) == 0:
                break
            newArticleList = []
            for article in articleList:
                total += 1
                print("total=" + str(total))
                url = article['url']
                attempts = 0
                while True:
                    attempts += 1
                    if attempts > 2:
                        break
                    statusCode, html = HttpHelper.fetch(url)
                    if html is not None and len(html) > 0:
                        article['status'] = 0
                        # Check title, TODO
                        print("update article ok, retry=" + str(attempts) +
                              ", url=" + url)
                        break
                    article['status'] = -1
                    print("update article error, retry=" + str(attempts) +
                          ", url=" + url)
                    time.sleep(1)
                article['updateTime'] = datetime.now()
                articleCollection.updateOne(article)
    except Exception as err:
        print(err)
    finally:
        print("exit")
def fetch_fashionbeans():
    """Crawl fashionbeans mens-hairstyles listing pages until roughly 200
    articles are saved, recording each one in MongoDB."""
    collection = MongoHelper("172.16.40.140", 27017, "ZDBFashionbeansCom",
                             "pages")
    entrance = "http://www.fashionbeans.com/category/mens-hairstyles/"
    doclist = []
    total = 1
    num = 1
    while total < 200 and entrance is not None:
        response = HttpHelper.fetch(entrance)  # [statusCode, html]
        soup = BeautifulSoup(response[1], "lxml")
        body = soup.find("div", attrs={"id": "catmainBody"})
        for card in body.find_all("div", attrs={"class": "catArticles"}):
            try:
                link = card.find("a", attrs={"class": "left relative"})
                url = link['href']
                filename = HttpHelper.fetchAndSave(
                    url, "utf-8", "D:/pages/fashionbeans.com")
                doclist.append({
                    "filename": filename,
                    "url": url,
                    "state": "fetched",
                    "domain": "www.fashionbeans.com"
                })
                print(total)
                total += 1
            except Exception as err:
                print(err)
        next_link = soup.find("a", attrs={"class": "nextLink right"})
        print("页数:" + str(num))
        if next_link is None:
            entrance = None  # no further pages
            continue
        num += 1
        entrance = next_link['href']
    if doclist != []:
        collection.insertMany(doclist)
def initCat():
    """Seed the category collection from the Chinese→English category map,
    deriving a URL slug from each English name."""
    catDict = {
        '糖尿病': 'diabetes',
        '肺癌': 'Lung cancer',
        '风湿': 'Rheumatism',
        '牛皮癣': 'Psoriasis',
        '肺梗阻': 'Pulmonary obstruction',
        '失禁': 'Incontinence',
        'aarp补充医疗': 'aarp supplementary medical treatment',
        '其他疾病': 'other illnesses',
        '乳腺癌': 'Breast cancer',
        '多发性硬化症': 'Multiple sclerosis',
        '哮喘': 'Asthma',
        '药瘾': 'Drug addiction',
        '酒瘾': 'Alcoholism',
        # Disabled business/software categories, kept for future use:
        # '小企业融资':'Small Business Financing',
        # 'business phone':'business phone',
        # 'network security':'network security',
        # 'cloud':'cloud',
        # '商业软件':'commercial software',
        # '小企业软件':'Small Business Software',
        # '财务软件':'financial software',
        # '税务软件':'Tax software',
        # '小企业安卓财务app':'Small Business Android Finance app',
        # '薪酬管理软件':'Compensation Management Software',
        # 'SAP':'SAP',
        # '补丁管理软件':'Patch management software',
        # '网络服务':'Internet service',
        # '域名':'Domain name',
        # '虚拟服务器':'virtual server'
    }
    collection = MongoHelper(MONGO_HOST, 27017, MONGO_DATABASE_NAME,
                             'category')
    for cname, ename in catDict.items():
        slug = ename.lower().replace(" ", "-")
        collection.insertOne({"ename": ename, "cname": cname, "slug": slug})
def fetch_digitaltrends():
    """Crawl digitaltrends TV-review listing pages until roughly 120 items
    are saved, recording each one in MongoDB."""
    collection = MongoHelper("172.16.40.140", 27017, "ZDBDigitaltrendsCom",
                             "pages")
    entrance = "https://www.digitaltrends.com/tv-reviews/"
    doclist = []
    total = 1
    while total < 120 and entrance is not None:
        html = HttpHelper.fetch(entrance)  # [statusCode, html]
        soup = BeautifulSoup(html[1], "lxml")
        div = soup.find("div", attrs={"class": "m-products"})
        item = div.find_all("div", attrs={"class": "item"})
        for i in item:
            try:
                h3 = i.find("h3", attrs={"class": "title"})
                url = h3.find("a")['href']
                filename = HttpHelper.fetchAndSave(
                    url, "utf-8", "D:/pages/digitaltrends.com")
                doclist.append({
                    "filename": filename,
                    "url": url,
                    "state": "fetched",
                    "domain": "www.digitaltrends.com"
                })
                print(total)
                total += 1
            except Exception as err:
                print(err)
        a = soup.find_all("a", attrs={"class": "next page-numbers"})
        # find_all returns a (possibly empty) list, never None; the original
        # `if a is None` never fired, so a[-1] raised IndexError on the last
        # page instead of terminating the loop.
        if not a:
            entrance = None
            continue
        entrance = a[-1]['href']
    if doclist != []:
        collection.insertMany(doclist)
def fetch_cnet():
    """Crawl cnet tablet product listing pages until roughly 80 items are
    saved, recording each one in MongoDB."""
    collection = MongoHelper("172.16.40.140", 27017, "ZDBCnetCom", "pages")
    entrance = "https://www.cnet.com/topics/tablets/products/"
    doclist = []
    total = 1
    while total < 80 and entrance is not None:
        html = HttpHelper.fetch(entrance)  # [statusCode, html]
        soup = BeautifulSoup(html[1], "lxml")
        section1 = soup.find("section", attrs={"id": "dfllResults"})
        section2 = section1.find_all(
            "section", attrs={"class": "col-3 searchItem product "})
        for i in section2:
            try:
                a = i.find("a", attrs={"class": "imageWrap"})
                url = "https://www.cnet.com" + a['href']
                filename = HttpHelper.fetchAndSave(url, "utf-8",
                                                   "D:/pages/cnet.com")
                doclist.append({
                    "filename": filename,
                    "url": url,
                    "state": "fetched",
                    "domain": "www.cnet.com"
                })
                print(total)
                total += 1
            except Exception as err:
                print(err)
        a = soup.find_all("a", attrs={"class": "next"})
        # find_all returns a (possibly empty) list, never None; the original
        # `if a is None` never fired, so a[-1] raised IndexError on the last
        # page instead of terminating the loop.
        if not a:
            entrance = None
            continue
        entrance = "https://www.cnet.com" + a[-1]['href']
    if doclist != []:
        collection.insertMany(doclist)
def createAllPost():
    """Create a post for every supplement document in state 'built' and mark
    it 'posted'."""
    try:
        pdCollection = MongoHelper("172.16.40.140", 27017,
                                   "ZDBMedlineplusOrg", "supplement_copy")
        total = 0
        while True:
            pdList = pdCollection.nextPage(10)
            if pdList is None or len(pdList) == 0:
                break
            for pd in pdList:
                if pd['state'] != 'built':
                    continue
                newID = createPost(pd)
                if newID is not None:
                    doc = {"_id": pd['_id'], "ID": newID, "cat": pd['cat'],
                           "fileName": pd['fileName'], "url": pd['url'],
                           "host": pd['host'], "state": "posted",
                           "title": pd['title'], "content": pd['content'],
                           "description": pd['description'],
                           "attrlist": pd['attrlist'],
                           "contenthtml": pd['contenthtml']}
                    print(doc['ID'])
                    pdCollection.updateOne(doc)
                    doc.clear()
                    print("create post ok")
                else:
                    print("create post error")
                total += 1
                print('total=' + str(total) + ', title=' + pd['title'])
        # Typo fix: the original printed 'Creawte all posts ok'.
        print('Create all posts ok')
    except Exception as err:
        print(err)
def test():
    """Scrape the drugs.com 'a5' alphabetical index and store each drug link.

    Fix: the original stored the anchor *text* under "url" and the *href*
    under "title" — the two fields were swapped. Also passes an explicit
    parser to BeautifulSoup, like the rest of this module.
    """
    collection = MongoHelper("172.16.40.140", 27017, "ZDBTestCom", "drugs",
                             "url")
    doclist = []
    print("dbcom")
    html = HttpHelper.fetch("https://www.drugs.com/alpha/a5.html")
    soup = BeautifulSoup(html[1], "lxml")
    lists = soup.find_all('ul', attrs={"class": "doc-type-list"})
    for ul in lists:
        for link in ul.find_all('a'):
            text = link.text
            href = link['href']
            print('\n')
            doclist.append({"url": href, "title": text})
    print(doclist)
    collection.insertMany(doclist)
def dump():
    """Paging smoke test: iterate the keyword collection twice, resetting the
    cursor in between, and print running totals for each pass."""
    collection = MongoHelper(MONGO_HOST, 27017, MONGO_DATABASE_NAME, 'keyword')
    count = 0
    while True:
        batch = collection.nextPage(100)
        if batch is None or len(batch) == 0:
            break
        count += len(batch)
        print("total=" + str(count))
    print("total=" + str(count))
    collection.resetStartId()  # rewind the cursor for the second pass
    count = 0
    while True:
        batch = collection.nextPage(100)
        if batch is None or len(batch) == 0:
            break
        count += len(batch)
        print("second total=" + str(count))
    print("second total=" + str(count))
def test_chromedriver():
    """For each product in state 'pass', open its Amazon page in Chrome,
    click the gallery thumbnails so full-size image URLs load into the DOM,
    then append a WooCommerce-style CSV row and mark the document 'posted'."""
    try:
        total = 1
        doclist = []
        collection = MongoHelper("172.16.40.140", 27017, "ZDBTestCom",
                                 "bloodglucosemeter")
        while True:
            slist = collection.nextPage(100)
            if slist is None or len(slist) == 0:
                break
            for i in slist:
                doclist.append(i)
        for page in doclist:
            print(total)
            total += 1
            if page['state'] != 'pass':
                continue
            driver = webdriver.Chrome(
                'C:/Program Files (x86)/Google/Chrome/Application/chromedriver.exe')
            driver.get(page['url'])
            print("wait for u")
            # Click each 40x40 thumbnail at the known x offsets (19 / 71) so
            # the gallery loads the full-size image URLs.
            for thumb in driver.find_elements_by_tag_name("img"):
                if thumb.location['x'] == 19 or thumb.location['x'] == 71:
                    if thumb.size == {'height': 40, 'width': 40}:
                        ActionChains(driver).move_to_element(thumb).click(
                            thumb).perform()
            html = driver.page_source.encode('utf-8')
            driver.close()
            soup = BeautifulSoup(html, "lxml")
            with open("./product.csv", "a+", newline='',
                      encoding="utf-8") as c:
                writer = csv.writer(c, dialect='excel')
                # Comma-joined full-size image URLs.
                img = ""
                for wrapper in soup.find_all("div",
                                             attrs={"class": "imgTagWrapper"}):
                    for tag in wrapper.find_all("img"):
                        img = img + tag['src'] + ","
                img = img[0:-1]  # drop the trailing comma
                price = ""
                for node in soup.find_all(
                        "span", attrs={"id": "priceblock_ourprice"}):
                    price = node.text
                price = price.strip()
                # Product description lives in div#productDescription.
                des = ""
                for node in soup.find_all("div",
                                          attrs={"id": "productDescription"}):
                    des = node
                des = des.encode("utf-8").decode()
                des = des.strip()
                des_html = "<div class=\"productdescription\">" + des + "</div>"
                img = img.encode("utf-8").decode()
                img = img.strip()
                sdes = page['inner_des']
                sdes = "<div class = \"short-des\">" + "<a href = \"" + page['brand_a'] + "\">" + \
                       "<font size=1 color=blue>" + page['brand'] + "</font></a><br>About the product<br>" \
                       + sdes + "</div>"
                writer.writerow(['', 'simple', '', page['title'], '1', '0',
                                 'visible', sdes, des_html, '', '', 'taxable',
                                 '', '1', '', '0', '0', '', '', '', '', '1',
                                 '', '', price, 'blood glucose meter', '', '',
                                 img, '', '', '', '', '', '', '', '', '0'])
                print("csv ok")
                doc = {"_id": page['_id'], "brand": page['brand'],
                       "url": page['url'], "state": "posted", "price": price,
                       "title": page['title'], "brand_a": page['brand_a'],
                       "inner_des": page['inner_des'], "product_des": des}
                collection.updateOne(doc)
                doc.clear()
                print("mongo ok")
                c.close()  # redundant inside `with`; kept from the original
    except Exception as err:
        print(err)
def test2():
    """Parse every FETCHED supplement url into (subtitle, innerhtml, text)
    sections, summarize the combined text, and mark the document 'completed'.

    Two site layouts are handled, selected by the document's `cat` field.
    """
    # First pull every document out of the collection into doclist.
    collection = MongoHelper("172.16.40.140", 27017, "ZDBMedlineplusOrg",
                             "supplement_copy", "url")
    nlp = NLPHelper()
    doclist = []
    sections = []  # accumulates the new `attrlist` entries per document
    while True:
        slist = collection.nextPage(100)
        if slist is None or len(slist) == 0:
            break
        for i in slist:
            doclist.append(i)
    total = 0
    # Then parse each url.
    for i in doclist:
        if i['state'] != "FETCHED":  # only process FETCHED documents
            continue
        if i['cat'] == 1:  # layout of site #1
            html = HttpHelper.fetch(i['url'])
            # fetch returns [statusCode, html]; element 1 is the markup.
            soup = BeautifulSoup(html[1])
            content = ""  # page content = concatenation of all section texts
            for section in soup.find_all("section"):
                for heading in section.find_all("h2"):  # h2 tags are subtitles
                    title = heading.text
                for body in section.find_all("div",
                                             attrs={"class": "section-body"}):
                    sections.append({
                        "subtitle": title,
                        "innerhtml": str(body),
                        "text": body.text
                    })
                    content += body.text
            description = nlp.getSummary(content, wordCount=20)  # build summary
            updated = {
                "_id": i['_id'], "cat": i['cat'], "fileName": i['fileName'],
                "url": i['url'], "host": i['host'], "state": "completed",
                "title": i['title'], "content": content,
                "description": description, "attrlist": sections
            }
            collection.updateOne(updated)
            sections.clear()
            updated.clear()  # reset both buffers after each update
            total += 1
            print(total)  # running count of completed documents
        elif i['cat'] == 2:  # layout of site #2, same idea
            html = HttpHelper.fetch(i['url'])
            soup = BeautifulSoup(html[1])
            divs = soup.find_all(
                "div", attrs={"class": re.compile('field field-name-body*')})
            content = ""
            for div in divs:
                # Collect the page's subtitles so each can be paired with its
                # matching innerhtml below.
                titles = []
                for heading in div.find_all("h2"):
                    titles.append(heading.text)
                index = 0
                for ul in div.find_all("ul"):
                    if index > len(titles) - 1:  # avoid index out of range
                        break
                    sections.append({
                        "subtitle": str(titles[index]),
                        "innerhtml": str(ul),
                        "text": ul.text
                    })
                    content += ul.text
                    index = index + 1
                titles.clear()
            description = nlp.getSummary(content, wordCount=20)
            updated = {
                "_id": i['_id'], "cat": i['cat'], "fileName": i['fileName'],
                "url": i['url'], "host": i['host'], "state": "completed",
                "title": i['title'], "content": content,
                "description": description, "attrlist": sections
            }
            collection.updateOne(updated)
            sections.clear()
            updated.clear()
            total += 1
            print(total)
def amazonfetch():
    """Search Amazon for blood-pressure monitors and collect up to 90
    products priced $20–$50 with more than 100 reviews into MongoDB.

    (An earlier revision also wrote an xlwt spreadsheet; that disabled code
    has been dropped.)
    """
    total = 1
    goods = 1
    url = "https://www.amazon.com/s/ref=sr_as_oo?rh=i%3Aaps%2Ck%3Ablood+pressure+monitor&keywords=blood+pressure+mon" \
          "itor&ie=UTF8&qid=1527130301"
    collection = MongoHelper("172.16.40.140", 27017, "ZDBTestCom",
                             "bloodglucosemeter")
    doc = []
    while url is not None:
        statuscode, html = HttpHelper.fetch(url)
        soup = BeautifulSoup(html)
        for s in soup('script'):  # strip scripts before inspecting results
            s.extract()
        li_all = soup.find_all("li",
                               attrs={"id": re.compile("^result_\d{1,2}")})
        for li in li_all:
            print("正在检查第" + str(goods) + "件商品")
            goods += 1
            # Sponsored/widget rows carry this header — skip them.
            flag = li.find_all("p", attrs={"class": "acs-mn2-midwidgetHeader"})
            if flag != []:
                print(flag)
                continue
            a = li.find_all(
                "a",
                attrs={"class":
                       re.compile("^a-link-normal s-access-detail-page.*")})
            if a == []:
                continue
            for link in a:
                url2 = link['href']
            branddiv = li.find_all("div",
                                   attrs={"class": "a-row a-spacing-none"})
            if branddiv == []:
                continue
            brand = ""
            for row in branddiv:
                span = row.find_all(
                    "span", attrs={"class": "a-size-small a-color-secondary"})
                if span == []:
                    continue
                for piece in span:
                    brand += piece.text
            brand = brand[3:]  # drop the leading "by " prefix
            p = li.find_all("span", attrs={"class": "sx-price-whole"})
            if p == []:
                continue
            for node in p:
                price = node.text
            if price == []:
                continue
            div = li.find_all("div", attrs={"class": "a-row a-spacing-mini"})
            if div == []:
                continue
            for row in div:
                comment_all = row.find_all(
                    "a",
                    attrs={"class":
                           "a-size-small a-link-normal a-text-normal"})
                if comment_all == []:
                    continue
                for node in comment_all:
                    comment = node.text
            print("price的类型是:")
            print(type(price))
            print(type(comment))
            price = price.replace(",", "")
            comment = comment.replace(",", "")
            print(price)
            print(comment)
            try:
                if isinstance(price, str):
                    price1 = int(price)
                if isinstance(comment, str):
                    comment1 = int(comment)
            except Exception as err:
                print(err)
                continue
            if price1 > 20 and price1 < 50 and comment1 > 100:
                print(brand)
                print("No." + str(total))
                total += 1
                url3 = url2
                if re.match("^/{1}.*", url2):  # relative link → absolute
                    url3 = "https://www.amazon.com" + url2
                doc.append({'brand': brand, 'url': url3, 'state': 'fetched',
                            'price': price + ".99"})
            if total > 90:
                print("completed")
                collection.insertMany(doc)
                return
        next_page = soup.find_all("a", attrs={"id": "pagnNextLink"})
        if next_page == []:
            url = None
            continue
        for link in next_page:
            if re.match("^/{1}.*", link['href']):
                url = "https://www.amazon.com" + link['href']
            else:
                url = link['href']
    print("not enough 90")
    collection.insertMany(doc)