Example #1
0
    def scrape_page_map(self, sub_site, url, bs):
        """Build the bulk-update payload for a page with no special markup.

        Args:
            sub_site: Sub-site identifier, stored on the page map unchanged.
            url: Page URL; it doubles as the page id.
            bs: Parsed document exposing ``get_text()`` and ``title``
                (presumably a BeautifulSoup object -- confirm with caller).

        Returns:
            The result of ``elastic.convert_for_bulk(pagemap, 'update')``.
        """
        pagemap = models.PageMap()
        pagemap.page_id = url
        pagemap.site = self.site
        pagemap.sub_site = sub_site
        pagemap.url = url
        pagemap.section = ''
        # No date is extracted from the page; the scrape time is used.
        pagemap.published_date = datetime.today()

        # Each field is best-effort: a missing or unusable document
        # (AttributeError) leaves the field unset instead of aborting.
        try:
            pagemap.page = bs.get_text()
        except AttributeError:
            pass
        try:
            title_tag = bs.title
            pagemap.title = title_tag.text if title_tag is not None else ''
        except AttributeError:
            pass

        return elastic.convert_for_bulk(pagemap, 'update')
Example #2
0
    def scrape_page_map(self, sub_site, url, bs):
        """Build the bulk-update payload for a ``box_1``/``box_2`` page.

        Every field is extracted on a best-effort basis: when the expected
        markup is missing, the field keeps its fallback value (or stays
        unset when there is no fallback).

        Args:
            sub_site: Sub-site identifier, stored on the page map unchanged.
            url: Page URL; it doubles as the page id.
            bs: Parsed document exposing ``find()``, ``get_text()`` and
                ``title`` (presumably a BeautifulSoup object -- confirm
                with caller).

        Returns:
            The result of ``elastic.convert_for_bulk(pagemap, 'update')``.
        """
        pagemap = models.PageMap()
        pagemap.page_id = url
        pagemap.site = self.site
        pagemap.sub_site = sub_site
        pagemap.url = url

        # The main content container is shared by the date, page and title
        # extraction, so look it up once.  ``None`` (tag or document
        # missing) makes the dependent steps below fall back gracefully.
        try:
            box_1_tag = bs.find("div", class_="box_1")
        except AttributeError:
            box_1_tag = None

        # Posted date: default to the scrape time, then prefer the date
        # shown in the author info block.
        pagemap.posted_date = datetime.today()
        try:
            author_info_tag = bs.find("div", class_="author_info")
            published = author_info_tag.find('p', class_='date').text
            pagemap.posted_date = datetime.strptime(published.strip(),
                                                    '%d-%b-%Y')
        except (AttributeError, ValueError):
            # Tag missing (None has no .find/.text) or unparsable date.
            pass
        # A date embedded in the product info bar, when present, wins.
        try:
            product_info_bar_tag = box_1_tag.find("div",
                                                  class_="product_info_bar")
            published = re.search(r'[0-9]{2}-[A-Za-z]{3}-[0-9]{4}',
                                  product_info_bar_tag.text)
            pagemap.posted_date = datetime.strptime(published.group(0),
                                                    '%d-%b-%Y')
        except (AttributeError, ValueError):
            # Tag missing, no date-like text found, or unparsable date.
            pass

        # Page text: whole document -> box_1 -> product text or story,
        # keeping the most specific source that exists.
        try:
            pagemap.page = bs.get_text()
            pagemap.page = box_1_tag.text
            product_main_text_tag = box_1_tag.find("div",
                                                   class_="product_main_text")
            if product_main_text_tag is not None:
                pagemap.page = product_main_text_tag.text
            else:
                pagemap.page = box_1_tag.find("div", class_="story").text
        except AttributeError:
            pass

        # Title: <title> element (or '') overridden by the box_1 <h1>.
        try:
            title_tag = bs.title
            pagemap.title = title_tag.text if title_tag is not None else ''
            pagemap.title = box_1_tag.find("h1").text
        except AttributeError:
            pass

        # Section: text of the box_2 block, trimmed of surrounding blanks.
        try:
            box_2_tag = bs.find("div", class_="box_2")
            pagemap.section = box_2_tag.text.strip(' \t\n\r')
        except AttributeError:
            pass

        return elastic.convert_for_bulk(pagemap, 'update')
Example #3
0
    def scrape_page_map(self, sub_site, url, bs):
        """Build the bulk-update payload for an ``entry-*`` styled page.

        Every field is extracted on a best-effort basis: when the expected
        markup is missing, the field keeps its fallback value (or stays
        unset when there is no fallback).

        Args:
            sub_site: Sub-site identifier; also stored as the section.
            url: Page URL; it doubles as the page id.
            bs: Parsed document exposing ``find()``, ``get_text()`` and
                ``title`` (presumably a BeautifulSoup object -- confirm
                with caller).

        Returns:
            The result of ``elastic.convert_for_bulk(pagemap, 'update')``.
        """
        pagemap = models.PageMap()
        pagemap.page_id = url
        pagemap.site = self.site
        pagemap.sub_site = sub_site
        pagemap.url = url
        pagemap.section = sub_site

        # Posted date: default to the scrape time, then prefer
        # <span class="entry-date">May 23, 2017</span>.
        pagemap.posted_date = datetime.today()
        try:
            entry_date_tag = bs.find("span", class_="entry-date")
            # Strip first: strptime rejects leading/trailing whitespace.
            published = entry_date_tag.text.strip()
            pagemap.posted_date = datetime.strptime(published, '%B %d, %Y')
        except (AttributeError, ValueError):
            # Tag missing (None has no .text) or unparsable date.
            pass

        # Page text: whole document, overridden by
        # <section class="entry-content"> when present.
        try:
            pagemap.page = bs.get_text()
            entry_content_tag = bs.find("section", class_="entry-content")
            pagemap.page = entry_content_tag.text
        except AttributeError:
            pass

        # Title: <title> element (or ''), overridden by
        # <h1 class="entry-title"> when present.
        try:
            title_tag = bs.title
            pagemap.title = title_tag.text if title_tag is not None else ''
            entry_title_tag = bs.find("h1", class_="entry-title")
            pagemap.title = entry_title_tag.text
        except AttributeError:
            pass

        return elastic.convert_for_bulk(pagemap, 'update')
Example #4
0
    def scrape_page_map(self, sub_site, url, bs):
        """Build the bulk-update payload for an ``entry-*`` styled page.

        Every field is extracted on a best-effort basis: when the expected
        markup is missing, the field keeps its fallback value (or stays
        unset when there is no fallback).

        Args:
            sub_site: Sub-site identifier; also stored as the section.
            url: Page URL; it doubles as the page id.
            bs: Parsed document exposing ``find()``, ``get_text()`` and
                ``title`` (presumably a BeautifulSoup object -- confirm
                with caller).

        Returns:
            The result of ``elastic.convert_for_bulk(pagemap, 'update')``.
        """
        pagemap = models.PageMap()
        pagemap.page_id = url
        pagemap.site = self.site
        pagemap.sub_site = sub_site
        pagemap.url = url
        pagemap.section = sub_site

        # Published date: default to the scrape time, then prefer
        # <span class="entry-date">May 23, 2017</span>.
        # NOTE(review): the fallback is a datetime while the parsed value
        # is a date -- confirm which type the index mapping expects.
        pagemap.published_date = datetime.today()
        try:
            entry_date_tag = bs.find("span", class_="entry-date")
            # Strip first: strptime rejects leading/trailing whitespace.
            published = entry_date_tag.text.strip()
            pagemap.published_date = datetime.strptime(published,
                                                       '%B %d, %Y').date()
        except (AttributeError, ValueError):
            # Tag missing (None has no .text) or unparsable date.
            pass

        # Page text: whole document, overridden by
        # <section class="entry-content"> when present.
        try:
            pagemap.page = bs.get_text()
            entry_content_tag = bs.find("section", class_="entry-content")
            pagemap.page = entry_content_tag.text
        except AttributeError:
            pass

        # Title: <title> element (or ''), overridden by
        # <h1 class="entry-title"> when present.
        try:
            title_tag = bs.title
            pagemap.title = title_tag.text if title_tag is not None else ''
            entry_title_tag = bs.find("h1", class_="entry-title")
            pagemap.title = entry_title_tag.text
        except AttributeError:
            pass

        return elastic.convert_for_bulk(pagemap, 'update')
Example #5
0
    def scrape_page_map(self, sub_site, url, bs):
        """Build the bulk-update payload for an ``<article>`` based page.

        Every field is extracted on a best-effort basis: when the expected
        markup is missing, the field is simply left unset.

        Args:
            sub_site: Sub-site identifier; also selects how the section is
                derived ('Skin-care'/'Hair-care' read it from the article
                header, anything else is filed under 'blog').
            url: Page URL; it doubles as the page id.
            bs: Parsed document exposing ``find()`` and ``title``
                (presumably a BeautifulSoup object -- confirm with caller).

        Returns:
            The result of ``elastic.convert_for_bulk(pagemap, 'update')``.
        """
        pagemap = models.PageMap()
        pagemap.page_id = url
        pagemap.site = self.site
        pagemap.sub_site = sub_site
        pagemap.url = url

        # May be None when the page has no <article>; the lookups below
        # then raise AttributeError and the affected field is skipped.
        article_tag = bs.find('article')

        try:  # published date, from the article's <time> element
            # Strip first: strptime rejects leading/trailing whitespace.
            published = article_tag.find('time').text.strip()
            pagemap.published_date = datetime.strptime(published,
                                                       '%d-%b-%Y').date()
        except (AttributeError, ValueError):
            pass

        try:  # title: <title> element, else the article header's <h1>
            title_tag = bs.title
            if title_tag is not None:
                pagemap.title = title_tag.text
            else:
                pagemap.title = article_tag.header.h1.text
        except AttributeError:
            pass

        try:  # section
            if sub_site in ('Skin-care', 'Hair-care'):
                pagemap.section = article_tag.header.p.text.strip()
            else:
                pagemap.section = 'blog'
        except AttributeError:
            pass

        try:  # img_src: header image; KeyError when <img> has no src
            pagemap.img_src = article_tag.header.figure.img.attrs['src']
        except (AttributeError, KeyError):
            pass

        try:  # page text
            pagemap.page = article_tag.find('div',
                                            class_='Detail-content').text
        except AttributeError:
            pass

        return elastic.convert_for_bulk(pagemap, 'update')