def scrape_page_map(self, sub_site, url, bs):
    """Build a PageMap record for one page and convert it for a bulk update.

    This variant parses nothing site-specific: the published date is the
    current time, the section is empty, the page text is the full text of
    ``bs`` and the title comes from ``bs.title`` when there is one.

    Args:
        sub_site: Sub-site label, stored unchanged on the record.
        url: Page URL; also used as the record's ``page_id``.
        bs: Parsed document exposing ``get_text()`` and ``title``
            (presumably a BeautifulSoup object -- confirm against caller).

    Returns:
        Whatever ``elastic.convert_for_bulk(pagemap, 'update')`` returns.
    """
    pagemap = models.PageMap()
    pagemap.page_id = url
    pagemap.site = self.site
    pagemap.sub_site = sub_site
    pagemap.url = url
    pagemap.section = ''
    # No date is parsed from the page; the scrape time is used instead.
    pagemap.published_date = datetime.today()

    # Page text and title are best-effort: a failure leaves the field as
    # PageMap() initialised it. Only Exception is caught so that
    # KeyboardInterrupt / SystemExit still propagate.
    try:
        pagemap.page = bs.get_text()
    except Exception:
        pass

    try:
        if bs.title is not None:
            pagemap.title = bs.title.text
        else:
            pagemap.title = ''
    except Exception:
        pass

    return elastic.convert_for_bulk(pagemap, 'update')
def scrape_page_map(self, sub_site, url, bs):
    """Build a PageMap record for one page and convert it for a bulk update.

    Every extraction step is best-effort: each starts from a generic
    fallback and overwrites it with progressively more specific values,
    stopping silently at the first lookup that fails.

    Args:
        sub_site: Sub-site label, stored unchanged on the record.
        url: Page URL; also used as the record's ``page_id``.
        bs: Parsed document exposing ``find()``, ``get_text()`` and
            ``title`` (presumably a BeautifulSoup object -- confirm
            against caller).

    Returns:
        Whatever ``elastic.convert_for_bulk(pagemap, 'update')`` returns.
    """
    pagemap = models.PageMap()
    pagemap.page_id = url
    pagemap.site = self.site
    pagemap.sub_site = sub_site
    pagemap.url = url

    # Posted date: fall back to the scrape time, then prefer the date shown
    # in the "author_info" block (format like 23-May-2017).
    # Only Exception is caught (here and below) so that KeyboardInterrupt /
    # SystemExit still propagate.
    try:
        pagemap.posted_date = datetime.today()
        author_info_tag = bs.find("div", class_="author_info")
        published = author_info_tag.find('p', class_='date').text
        pagemap.posted_date = datetime.strptime(published, '%d-%b-%Y')
    except Exception:
        pass

    # Posted date, second source: a dd-Mon-yyyy date embedded in the text of
    # the "product_info_bar" div; when found it takes precedence.
    try:
        box_1_tag = bs.find("div", class_="box_1")
        product_info_bar_tag = box_1_tag.find("div", class_="product_info_bar")
        published = re.search(r'[0-9]{2}-[A-Za-z]{3}-[0-9]{4}',
                              product_info_bar_tag.text)
        pagemap.posted_date = datetime.strptime(published.group(0), '%d-%b-%Y')
    except Exception:
        pass

    # Page text: whole document -> "box_1" div -> "product_main_text" div,
    # or the "story" div when there is no product text.
    try:
        pagemap.page = bs.get_text()
        box_1_tag = bs.find("div", class_="box_1")
        pagemap.page = box_1_tag.text
        product_main_text_tag = box_1_tag.find("div", class_="product_main_text")
        if product_main_text_tag is not None:
            pagemap.page = product_main_text_tag.text
        else:
            story_tag = box_1_tag.find("div", class_="story")
            pagemap.page = story_tag.text
    except Exception:
        pass

    # Title: document <title> (or empty), overridden by the <h1> in "box_1".
    try:
        if bs.title is not None:
            pagemap.title = bs.title.text
        else:
            pagemap.title = ''
        box_1_tag = bs.find("div", class_="box_1")
        pagemap.title = box_1_tag.find("h1").text
    except Exception:
        pass

    # Section: text of the "box_2" div without surrounding whitespace.
    try:
        box_2_tag = bs.find("div", class_="box_2")
        pagemap.section = box_2_tag.text.strip(' \t\n\r')
    except Exception:
        pass

    return elastic.convert_for_bulk(pagemap, 'update')
def scrape_page_map(self, sub_site, url, bs):
    """Build a PageMap record for one page and convert it for a bulk update.

    Date, page text and title each start from a generic fallback and are
    overwritten by the matching ``entry-*`` element when it can be read.
    The section is simply ``sub_site``.

    Args:
        sub_site: Sub-site label, stored as both ``sub_site`` and
            ``section`` on the record.
        url: Page URL; also used as the record's ``page_id``.
        bs: Parsed document exposing ``find()``, ``get_text()`` and
            ``title`` (presumably a BeautifulSoup object -- confirm
            against caller).

    Returns:
        Whatever ``elastic.convert_for_bulk(pagemap, 'update')`` returns.
    """
    pagemap = models.PageMap()
    pagemap.page_id = url
    pagemap.site = self.site
    pagemap.sub_site = sub_site
    pagemap.url = url
    pagemap.section = sub_site

    # Posted date: scrape time unless the entry date can be parsed.
    # <span class="entry-date">May 23, 2017</span>
    # Only Exception is caught (here and below) so that KeyboardInterrupt /
    # SystemExit still propagate.
    try:
        pagemap.posted_date = datetime.today()
        entry_date_tag = bs.find("span", class_="entry-date")
        pagemap.posted_date = datetime.strptime(entry_date_tag.text, '%B %d, %Y')
    except Exception:
        pass

    # Page text: whole document unless the entry content is present.
    # <section class="entry-content">
    try:
        pagemap.page = bs.get_text()
        entry_content_tag = bs.find("section", class_="entry-content")
        pagemap.page = entry_content_tag.text
    except Exception:
        pass

    # Title: document <title> (or empty) unless the entry title is present.
    # <h1 class="entry-title"></h1>
    try:
        if bs.title is not None:
            pagemap.title = bs.title.text
        else:
            pagemap.title = ''
        entry_title_tag = bs.find("h1", class_="entry-title")
        pagemap.title = entry_title_tag.text
    except Exception:
        pass

    return elastic.convert_for_bulk(pagemap, 'update')
def scrape_page_map(self, sub_site, url, bs):
    """Build a PageMap record for one page and convert it for a bulk update.

    Date, page text and title each start from a generic fallback and are
    overwritten by the matching ``entry-*`` element when it can be read.
    The section is simply ``sub_site``.

    Args:
        sub_site: Sub-site label, stored as both ``sub_site`` and
            ``section`` on the record.
        url: Page URL; also used as the record's ``page_id``.
        bs: Parsed document exposing ``find()``, ``get_text()`` and
            ``title`` (presumably a BeautifulSoup object -- confirm
            against caller).

    Returns:
        Whatever ``elastic.convert_for_bulk(pagemap, 'update')`` returns.
    """
    pagemap = models.PageMap()
    pagemap.page_id = url
    pagemap.site = self.site
    pagemap.sub_site = sub_site
    pagemap.url = url
    pagemap.section = sub_site

    # Published date: scrape time unless the entry date can be parsed.
    # <span class="entry-date">May 23, 2017</span>
    # NOTE(review): the fallback is a datetime while the parsed value is a
    # date -- confirm that consumers of published_date accept both.
    # Only Exception is caught (here and below) so that KeyboardInterrupt /
    # SystemExit still propagate.
    try:
        pagemap.published_date = datetime.today()
        entry_date_tag = bs.find("span", class_="entry-date")
        parsed = datetime.strptime(entry_date_tag.text, '%B %d, %Y')
        pagemap.published_date = parsed.date()
    except Exception:
        pass

    # Page text: whole document unless the entry content is present.
    # <section class="entry-content">
    try:
        pagemap.page = bs.get_text()
        entry_content_tag = bs.find("section", class_="entry-content")
        pagemap.page = entry_content_tag.text
    except Exception:
        pass

    # Title: document <title> (or empty) unless the entry title is present.
    # <h1 class="entry-title"></h1>
    try:
        if bs.title is not None:
            pagemap.title = bs.title.text
        else:
            pagemap.title = ''
        entry_title_tag = bs.find("h1", class_="entry-title")
        pagemap.title = entry_title_tag.text
    except Exception:
        pass

    return elastic.convert_for_bulk(pagemap, 'update')
def scrape_page_map(self, sub_site, url, bs):
    """Build a PageMap record for one page and convert it for a bulk update.

    All fields are read from the first ``<article>`` element of the page.
    Each extraction is best-effort: when a lookup fails, that field is left
    as ``PageMap()`` initialised it and the remaining fields are still tried.

    Args:
        sub_site: Sub-site label, stored unchanged on the record; it also
            decides how the section is derived.
        url: Page URL; also used as the record's ``page_id``.
        bs: Parsed document exposing ``find()`` and ``title``
            (presumably a BeautifulSoup object -- confirm against caller).

    Returns:
        Whatever ``elastic.convert_for_bulk(pagemap, 'update')`` returns.
    """
    pagemap = models.PageMap()
    pagemap.page_id = url
    pagemap.site = self.site
    pagemap.sub_site = sub_site
    pagemap.url = url

    article_tag = bs.find('article')

    # Only Exception is caught below so that KeyboardInterrupt / SystemExit
    # still propagate.

    # Published date: text of the article's <time> element, e.g. 23-May-2017.
    try:
        published = article_tag.find('time').text
        pagemap.published_date = datetime.strptime(published, '%d-%b-%Y').date()
    except Exception:
        pass

    # Title: document <title>, else the article header's <h1>.
    try:
        if bs.title is not None:
            pagemap.title = bs.title.text
        else:
            pagemap.title = article_tag.header.h1.text
    except Exception:
        pass

    # Section: read from the article header for the two care sub-sites;
    # every other sub-site is filed under 'blog'.
    try:
        if sub_site in ('Skin-care', 'Hair-care'):
            pagemap.section = article_tag.header.p.text.strip()
        else:
            pagemap.section = 'blog'
    except Exception:
        pass

    # Image source: src attribute of the image in the article header figure.
    try:
        pagemap.img_src = article_tag.header.figure.img.attrs['src']
    except Exception:
        pass

    # Page text: contents of the "Detail-content" div.
    try:
        pagemap.page = article_tag.find('div', class_='Detail-content').text
    except Exception:
        pass

    return elastic.convert_for_bulk(pagemap, 'update')