def parse_url(url="http://www.highya.com/"):
    """Scrape the "latest reviews" listed on the highya.com home page.

    The home page at ``url`` is downloaded with ``req_main``; every link in
    the "latest reviews" list is then downloaded and the review article on
    that page is broken into its fields.

    Args:
        url: Address of the page that carries the "latest reviews" list.

    Returns:
        A list with one dict per review that could be parsed.  Keys:
        ``domain``, ``main_title``, ``main_title_link``, ``blog_title``,
        ``blog_link``, ``category``, ``cat_link``, ``entry_content`` (the
        article ``div`` tag) and ``entry_text`` (its text).  The list is
        empty when the home page, the heading or the list is missing.
    """
    records = []

    page = req_main(url)
    if not page:
        return records

    soup = BeautifulSoup(page)
    latest_reviews_h3 = soup.find("h3", text=re.compile("latest reviews:"))
    if latest_reviews_h3 is None:
        return records

    post_div = latest_reviews_h3.find_next(
        "div", attrs={"class": "clearfix like-right-col"})
    if post_div is None:
        return records

    domain = "www.highya.com"
    for li in post_div.find_all("li"):
        a = li.find("a")
        href = a.get("href") if a is not None else None
        if not href:
            # List item without a usable anchor: nothing to follow.
            continue

        link = "http://www.highya.com%s" % href
        # Fetch the review page itself (not the home page again).
        page2 = req_main(link)
        if not page2:
            continue

        soup2 = BeautifulSoup(page2)
        article_tag = soup2.find("article", attrs={"class": "product-article"})
        if article_tag is None:
            continue

        article_div = article_tag.find(
            "div", attrs={"class": "site-section section-article"})
        if article_div is None:
            continue

        header = article_tag.find("header")
        title_tag = soup2.find("title")
        h2_tag = article_div.find("h2")

        # The category is the last path segment of the header's itemtype
        # attribute; it stays None when the header or attribute is absent.
        itemtype = header.get("itemtype") if header is not None else None
        category = itemtype.split("/")[-1] if itemtype else None

        records.append({
            "domain": domain,
            # ``.text`` is a property on a tag, not a method.
            "main_title": title_tag.text if title_tag is not None else None,
            "main_title_link": link,
            "blog_title": h2_tag.text if h2_tag is not None else None,
            "blog_link": link,
            "category": category,
            "cat_link": link,
            "entry_content": article_div,
            "entry_text": article_div.text,
        })

    return records
def home_page_link(self):
    """Download the testolimitfacts.com start page(s) and process them.

    Calls ``self.creat_avv_blog_scrap_table()`` first, then fetches every
    active start URL with ``req_main`` and passes each non-empty response,
    together with its URL, to ``self.get_link_from_first_page``.
    """
    self.creat_avv_blog_scrap_table()

    # Only the site root is active.  Article URLs that are known but
    # currently disabled (all under http://testolimitfacts.com/):
    #   testo-limit-review/, slimgenix-pro/, power-pro/,
    #   addium-brain-enhancer-another-scam/, enduros-male-enhancement/,
    #   testo-xl/, is-spartagen-xt-scam/, elite-test-360/,
    #   honest-green-coffee-bean-extract/,
    #   premium-natural-garcinia-cambogia/, maximum-shred/,
    #   extreme-home-profits-review-worth-the-money-or-a-scam/,
    #   30-day-change/, 100-day-loans/
    start_urls = ("http://testolimitfacts.com/",)

    for start_url in start_urls:
        html = req_main(start_url)
        if not html:
            # Nothing came back for this URL; move on to the next one.
            continue
        self.get_link_from_first_page(start_url, html)
def home_page_link(self):
    """Download the healthyminimarket.com start page(s) and process them.

    Calls ``self.creat_avv_blog_scrap_table()`` first, then fetches every
    active start URL with ``req_main`` and passes each non-empty response to
    ``self.get_link_from_first_page``.
    """
    self.creat_avv_blog_scrap_table()

    # Only the site root is active.  Listing pages that are known but
    # currently disabled: http://www.healthyminimarket.com/page/N/ for
    # N = 2..13, 15 and 16.
    start_urls = ("http://www.healthyminimarket.com",)

    for start_url in start_urls:
        html = req_main(start_url)
        if not html:
            # Nothing came back for this URL; move on to the next one.
            continue
        self.get_link_from_first_page(html)
def open_home_page(self):
    """Process every article link found on the site's home page.

    Calls ``self.creat_avv_blog_scrap_table()``, downloads the home page
    with ``req_main``, extracts its links via
    ``self.get_all_link_home_page`` and hands each link to
    ``self.get_page_next_link``.  Does nothing further when the download
    returns a falsy value.
    """
    self.creat_avv_blog_scrap_table()

    link = "http://www.healthcaresdiscussion.com"
    page = req_main(link)
    if not page:
        return

    link_to_extract = self.get_all_link_home_page(page)
    # Explicit loop instead of map(): on Python 3 map() is lazy, so calling
    # it only for its side effects would never process any link.
    for next_link in link_to_extract:
        self.get_page_next_link(next_link)
def get_page_next_link(self, link):
    """Download and process ``link`` unless it is already stored.

    Looks the stripped link up in ``avv_blog_scrap_table.blog_link``.  When
    no row matches, the page is downloaded with ``req_main`` and, if the
    download is non-empty, passed to ``self.get_detail_next_page``.

    Args:
        link: URL of the article page to check and, if new, process.
    """
    # The link comes from scraped HTML, so bind it as a query parameter
    # instead of interpolating it into the SQL text (a quote in the URL
    # would otherwise break the statement or allow injection).
    # NOTE(review): "%s" is the placeholder for format-paramstyle drivers
    # (MySQLdb, pymysql, psycopg2); use "?" if self.cursor is sqlite3 --
    # confirm against the connection setup.
    sql = "SELECT * FROM avv_blog_scrap_table WHERE blog_link = %s"
    self.cursor.execute(sql, (self.my_strip(link),))
    results = self.cursor.fetchall()
    if results:
        # Already stored: skip the download.
        return

    page2 = req_main(link)
    if page2:
        self.get_detail_next_page(link, page2)
def prev_home_page(self):
    """Process the article links found on the older listing pages.

    Calls ``self.creat_avv_blog_scrap_table()``, then for each listing page
    (pages 2 and 3) downloads it with ``req_main``, extracts its links via
    ``self.get_all_link_home_page`` and hands each link to
    ``self.get_page_next_link``.  Pages whose download returns a falsy
    value are skipped.
    """
    self.creat_avv_blog_scrap_table()

    link_lists = [
        'http://www.healthcaresdiscussion.com/page/2/',
        'http://www.healthcaresdiscussion.com/page/3/',
    ]
    for link in link_lists:
        page = req_main(link)
        if not page:
            continue

        link_to_extract = self.get_all_link_home_page(page)
        # Explicit loop instead of map(): on Python 3 map() is lazy, so
        # calling it only for its side effects would never process a link.
        for next_link in link_to_extract:
            self.get_page_next_link(next_link)