    def parse(self, soup):
        """
        :return: Dict of the comic's details to be added to the db
        """
        # Start from the archive metadata for this comic; it already includes the title and posted_at date
        rdata = self.scraper.archive_list.get(self.comic_id)

        # Parse the items here and return the content to be added to the db
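        # The strip image is the element with id 'cc-comic'; its 'src' and 'title' (alt text) are used below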
        comic_raw = soup.find(id='cc-comic')
        img_src = comic_raw['src']

        comic_filename = '{base}/{year}/{month}/{name}{ext}'\
                         .format(base=self.scraper.BASE_SAVE_DIR,
                                 year=rdata['posted_at'].year,
                                 month=rdata['posted_at'].month,
                                 name=str(rdata['posted_at']),
                                 ext=cutil.get_file_ext(img_src))
        rdata.update({
            'time_collected': cutil.get_datetime(),
            'file_path': self.web.download(img_src, comic_filename)
                             .replace(self.scraper.BASE_DATA_DIR + os.path.sep, ''),
            'alt': comic_raw['title'],
        })

        return rdata
    def log_last_scraped(self):
        try:
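            # A single Setting row (bit == 0) appears to hold scraper state such as when each scraper last ran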
            setting = db_session.query(Setting).filter(Setting.bit == 0).one()
            setting.comic_last_ran = cutil.get_datetime()

            db_session.add(setting)
            db_session.commit()

        except Exception:
            logger.exception("Problem logging last comic scraped")
    def parse_book(self, content):
        """
        :return: Dict of the book's details to be added to the db
        """
        cover_source = content.find('img', {'itemprop': 'image'})['src'].strip()
        try:
            subtitle = content.find('h3').getText().strip()
        except AttributeError:
            subtitle = None

        try:
            file_source = content.find('a', {'href': re.compile('http://filepi.com')})['href']
        except (AttributeError, TypeError):
            file_source = None

        parsed_data = {'book_id': self.book_id,
                       'file_location': None,
                       'file_cover_location': None,
                       'file_cover_source': self.web.scraper.BASE_URL + cover_source,
                       'description': content.find('span', {'itemprop': 'description'}).getText().strip(),
                       'file_source': file_source,
                       'format': content.find(attrs={'itemprop': 'bookFormat'}).getText().strip().lower(),
                       'isbn': content.find(attrs={'itemprop': 'isbn'}).getText().strip(),
                       'language': content.find(attrs={'itemprop': 'inLanguage'}).getText().strip(),
                       'pages': content.find(attrs={'itemprop': 'numberOfPages'}).getText().strip(),
                       'publisher': content.find(attrs={'itemprop': 'publisher'}).getText().strip(),
                       'title': content.find('h1', {'itemprop': 'name'}).getText().strip(),
                       'subtitle': subtitle,
                       'year': content.find(attrs={'itemprop': 'datePublished'}).getText().strip(),
                       'author': content.find(attrs={'itemprop': 'author'}).getText().strip(),
                       'time_collected': cutil.get_datetime(),
                       }

        # Download book
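        # Saved files are grouped by the last two characters of the book id, presumably to keep any one directory small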
        base_filename = '{last_nums}/{book_id}/{book_id}'\
                        .format(last_nums=self.book_id[-2:], book_id=self.book_id)

        book_filename = '{base_filename}_book.{ext}'.format(base_filename=base_filename,
                                                            ext=parsed_data.get('format'))
        cover_ext = cutil.get_file_ext(parsed_data.get('file_cover_source'))
        book_cover_filename = '{base_filename}_cover{ext}'.format(base_filename=base_filename,
                                                                  ext=cover_ext)
        parsed_data['file_cover_location'] = self.web.download(parsed_data.get('file_cover_source'),
                                                               book_cover_filename)

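        # filepi.com appears to reject requests without a Referer header, so send the scraper's base URL with the download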
        header = {'Referer': self.web.scraper.BASE_URL}
        if parsed_data.get('file_source') is not None:
            parsed_data['file_location'] = self.web.download(parsed_data.get('file_source'),
                                                             book_filename,
                                                             header=header)

        return parsed_data
    def log_last_scraped(self):
        try:
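            # Find the lowest comic id we did not scrape yet and resume from there next run;
            # if every queued id was already scraped, fall back to the highest known id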
            try:
                last_comic_id = min(self.comic_ids) - 1
            except ValueError:
                last_comic_id = self.max_id

            setting = db_session.query(Setting).filter(Setting.bit == 0).one()
            setting.comic_last_id = last_comic_id
            setting.comic_last_ran = cutil.get_datetime()

            db_session.add(setting)
            db_session.commit()

        except Exception:
            logger.exception("Problem logging last comic scraped")
    def log_last_scraped(self):
        try:
            try:
                last_book = db_session.query(Book).order_by(Book.book_id.desc()).first()
                if last_book is not None:
                    setting = db_session.query(Setting).filter(Setting.bit == 0).one()
                    setting.book_last_id = last_book.book_id
                    setting.book_last_ran = cutil.get_datetime()

                    db_session.add(setting)
                    db_session.commit()
            except NoResultFound:
                # No Setting row exists yet, so there is nothing to update
                pass

        except Exception:
            logger.exception("Problem logging last book scraped")
    def log_last_scraped(self):
        try:
            # Find the lowest comic id we did not scrape yet and start there next time
            if 404 in self.comic_ids:
                # Comic 404 intentionally does not exist, so requesting it always returns a 404 page
                self.comic_ids.remove(404)
            try:
                last_comic_id = min(self.comic_ids) - 1
            except ValueError:
                last_comic_id = self.max_id

            setting = db_session.query(Setting).filter(Setting.bit == 0).one()
            setting.comic_last_id = last_comic_id
            setting.comic_last_ran = cutil.get_datetime()

            db_session.add(setting)
            db_session.commit()

        except Exception:
            logger.exception("Problem logging last comic scraped")
    def parse(self, response):
        """
        :return: Dict of the parsed comic content
        """
        rdata = {}
        # Parse the items here and return the content to be added to the db
        logger.info("Getting comic {comic_id}-{comic_title}".format(
            comic_id=response.get('num'), comic_title=response.get('title')))

        comic_filename = '{base}/{last_num}/{comic_id}{file_ext}'\
                         .format(base=self.scraper.BASE_SAVE_DIR,
                                 last_num=str(response.get('num'))[-1],
                                 comic_id=response.get('num'),
                                 file_ext=cutil.get_file_ext(response.get('img'))
                                 )
        posted_at = '{year}-{month}-{day}'.format(year=response.get('year'),
                                                  month=response.get('month'),
                                                  day=response.get('day'))
        rdata = {
            'comic_id': response.get('num'),
            'alt': response.get('alt'),
            'source_file_location': response.get('img'),
            'saved_file_location': self.web.download(response.get('img'), comic_filename)
                                           .replace(self.scraper.BASE_DATA_DIR + os.path.sep, ''),
            'posted_at': cutil.str_to_date(posted_at, formats=["%Y-%m-%d"]),
            'time_collected': cutil.get_datetime(),
            'title': response.get('title'),
            'transcript': response.get('transcript'),
            'raw_json': json.dumps(response),
        }

        return rdata
    def parse(self, soup):
        """
        :return: Dict of the comic's details to be added to the db
        """
        # Start from the archive metadata for this comic, which already includes the title
        rdata = self.web.scraper.archive_list.get(self.comic_id)

        # Parse the items here and return the content to be added to the db
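        # The strip's 'src' is site-relative (it appears to start with '.'), so drop that character and prepend the site root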
        img_src = "http://www.questionablecontent.net" + soup.find(
            'img', {'id': 'strip'})['src'][1:]
        news = soup.find('div', {'id': 'news'}).text.strip()

        comic_filename = '{last_num}/{comic_id}.png'\
                          .format(last_num=str(self.comic_id)[-1],
                                  comic_id=self.comic_id)
        rdata.update({
            'comic_id': self.comic_id,
            'news': news,
            'file_path': self.web.download(img_src, comic_filename),
            'time_collected': cutil.get_datetime(),
        })

        return rdata
    def parse(self, soup):
        """
        :return: Dict of the post's details to be added to the db
        """
        rdata = self.scraper.archive_list.get(self.whatif_id)

        # Parse the items here and return the content to be added to the db
        article = self.web.driver.find_element_by_css_selector('article.entry')

        rdata['question'] = soup.find('article', {'class': 'entry'}).find('p', {'id': 'question'}).get_text()

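        # What If posts are saved as a screenshot of the rendered article element rather than as a downloaded image file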
        whatif_filename = '{base}/{last_num}/{whatif_id}.png'\
                          .format(base=self.scraper.BASE_SAVE_DIR,
                                  last_num=str(self.whatif_id)[-1],
                                  whatif_id=self.whatif_id)

        rdata.update({'whatif_id': self.whatif_id,
                      'saved_file_location': self.web.screenshot(whatif_filename, element=article)
                                                     .replace(self.scraper.BASE_DATA_DIR + os.path.sep, ''),
                      'time_collected': cutil.get_datetime(),
                      })

        return rdata