def parse(self, soup):
    """
    :return: Dict of the item's details
    """
    # The archive entry already includes the title and posted_at date
    rdata = self.scraper.archive_list.get(self.comic_id)

    # Parse the items here and return the content to be added to the db
    comic_raw = soup.find(id='cc-comic')
    img_src = comic_raw['src']
    comic_filename = '{base}/{year}/{month}/{name}{ext}'\
        .format(base=self.scraper.BASE_SAVE_DIR,
                year=rdata['posted_at'].year,
                month=rdata['posted_at'].month,
                name=str(rdata['posted_at']),
                ext=cutil.get_file_ext(img_src))

    rdata.update({'time_collected': cutil.get_datetime(),
                  'file_path': self.web.download(img_src, comic_filename)
                                       .replace(self.scraper.BASE_DATA_DIR + os.path.sep, ''),
                  'alt': comic_raw['title'],
                  })

    return rdata
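# Illustration only (not from the source): the save path the template above
# yields, assuming posted_at is a datetime.date. BASE_SAVE_DIR, the date, and
# the '.png' extension are hypothetical values.
import datetime

BASE_SAVE_DIR = 'comics'
posted_at = datetime.date(2016, 5, 9)

comic_filename = '{base}/{year}/{month}/{name}{ext}'.format(base=BASE_SAVE_DIR,
                                                            year=posted_at.year,
                                                            month=posted_at.month,
                                                            name=str(posted_at),
                                                            ext='.png')
print(comic_filename)  # comics/2016/5/2016-05-09.png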
def log_last_scraped(self):
    try:
        setting = db_session.query(Setting).filter(Setting.bit == 0).one()
        setting.comic_last_ran = cutil.get_datetime()

        db_session.add(setting)
        db_session.commit()

    except Exception:
        logger.exception("Problem logging last comic scraped")
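# Hedged sketch (not from the source) of the Setting row the log_last_scraped
# helpers in this section assume: a single settings row selected with bit == 0
# that carries the "last id" / "last ran" bookkeeping columns referenced here
# and below. The table name and column types are assumptions.
from sqlalchemy import Column, DateTime, Integer
from sqlalchemy.ext.declarative import declarative_base

Base = declarative_base()


class Setting(Base):
    __tablename__ = 'setting'  # assumed table name

    id = Column(Integer, primary_key=True)
    bit = Column(Integer, default=0)     # the queries filter on Setting.bit == 0
    comic_last_id = Column(Integer)      # last comic id to resume from
    comic_last_ran = Column(DateTime)    # set to cutil.get_datetime() each run
    book_last_id = Column(Integer)       # used by the book variants below (type assumed)
    book_last_ran = Column(DateTime)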
def parse_book(self, content):
    """
    :return: Dict of the book's details
    """
    cover_source = content.find('img', {'itemprop': 'image'})['src'].strip()

    try:
        subtitle = content.find('h3').getText().strip()
    except AttributeError:
        subtitle = None

    try:
        file_source = content.find('a', {'href': re.compile('http://filepi.com')})['href']
    except (AttributeError, TypeError):
        file_source = None

    parsed_data = {'book_id': self.book_id,
                   'file_location': None,
                   'file_cover_location': None,
                   'file_cover_source': self.web.scraper.BASE_URL + cover_source,
                   'description': content.find('span', {'itemprop': 'description'}).getText().strip(),
                   'file_source': file_source,
                   'format': content.find(attrs={'itemprop': 'bookFormat'}).getText().strip().lower(),
                   'isbn': content.find(attrs={'itemprop': 'isbn'}).getText().strip(),
                   'language': content.find(attrs={'itemprop': 'inLanguage'}).getText().strip(),
                   'pages': content.find(attrs={'itemprop': 'numberOfPages'}).getText().strip(),
                   'publisher': content.find(attrs={'itemprop': 'publisher'}).getText().strip(),
                   'title': content.find('h1', {'itemprop': 'name'}).getText().strip(),
                   'subtitle': subtitle,
                   'year': content.find(attrs={'itemprop': 'datePublished'}).getText().strip(),
                   'author': content.find(attrs={'itemprop': 'author'}).getText().strip(),
                   'time_collected': cutil.get_datetime(),
                   }

    # Download book
    base_filename = '{last_nums}/{book_id}/{book_id}'\
        .format(last_nums=self.book_id[-2:], book_id=self.book_id)

    book_filename = '{base_filename}_book.{ext}'.format(base_filename=base_filename,
                                                        ext=parsed_data.get('format'))

    cover_ext = cutil.get_file_ext(parsed_data.get('file_cover_source'))
    book_cover_filename = '{base_filename}_cover{ext}'.format(base_filename=base_filename,
                                                              ext=cover_ext)

    parsed_data['file_cover_location'] = self.web.download(parsed_data.get('file_cover_source'),
                                                           book_cover_filename)

    header = {'Referer': self.web.scraper.BASE_URL}
    if parsed_data.get('file_source') is not None:
        parsed_data['file_location'] = self.web.download(parsed_data.get('file_source'),
                                                         book_filename,
                                                         header=header)

    return parsed_data
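# Illustration only: the relative paths the filename templates in parse_book()
# produce. The book_id, the 'pdf' format, and the '.jpg' cover extension are
# hypothetical values, not data from the source.
book_id = '1234567890'

base_filename = '{last_nums}/{book_id}/{book_id}'.format(last_nums=book_id[-2:],
                                                         book_id=book_id)

print('{}_book.{}'.format(base_filename, 'pdf'))    # 90/1234567890/1234567890_book.pdf
print('{}_cover{}'.format(base_filename, '.jpg'))   # 90/1234567890/1234567890_cover.jpg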
def log_last_scraped(self):
    try:
        try:
            last_comic_id = min(self.comic_ids) - 1
        except ValueError:
            last_comic_id = self.max_id

        setting = db_session.query(Setting).filter(Setting.bit == 0).one()
        setting.comic_last_id = last_comic_id
        setting.comic_last_ran = cutil.get_datetime()

        db_session.add(setting)
        db_session.commit()

    except Exception:
        logger.exception("Problem logging last comic scraped")
def log_last_scraped(self):
    try:
        try:
            last_book_id = db_session.query(Book).order_by(Book.book_id.desc()).first()
            if last_book_id is not None:
                setting = db_session.query(Setting).filter(Setting.bit == 0).one()
                setting.book_last_id = last_book_id.book_id
                setting.book_last_ran = cutil.get_datetime()

                db_session.add(setting)
                db_session.commit()

        except NoResultFound:
            # If there is no raw data then no books were collected
            pass

    except Exception:
        logger.exception("Problem logging last book scraped")
def log_last_scraped(self):
    try:
        # Find the lowest comic id we did not scrape yet and start there next time
        if 404 in self.comic_ids:
            # This one is never successful because it always returns a 404 page
            self.comic_ids.remove(404)

        try:
            last_comic_id = min(self.comic_ids) - 1
        except ValueError:
            last_comic_id = self.max_id

        setting = db_session.query(Setting).filter(Setting.bit == 0).one()
        setting.comic_last_id = last_comic_id
        setting.comic_last_ran = cutil.get_datetime()

        db_session.add(setting)
        db_session.commit()

    except Exception:
        logger.exception("Problem logging last comic scraped")
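# Standalone illustration of the resume logic above, assuming self.comic_ids
# holds the ids that were queued but not successfully scraped this run and
# self.max_id is the newest id seen. All values are made up.
comic_ids = {410, 415}
max_id = 420

try:
    last_comic_id = min(comic_ids) - 1   # resume just before the first unscraped id
except ValueError:
    last_comic_id = max_id               # nothing left over: pick up from the newest id

print(last_comic_id)  # 409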
def parse(self, response):
    """
    :return: Dict of the comic's content
    """
    # Parse the items here and return the content to be added to the db
    logger.info("Getting comic {comic_id}-{comic_title}".format(comic_id=response.get('num'),
                                                                comic_title=response.get('title')))

    comic_filename = '{base}/{last_num}/{comic_id}{file_ext}'\
        .format(base=self.scraper.BASE_SAVE_DIR,
                last_num=str(response.get('num'))[-1],
                comic_id=response.get('num'),
                file_ext=cutil.get_file_ext(response.get('img')))

    posted_at = '{year}-{month}-{day}'.format(year=response.get('year'),
                                              month=response.get('month'),
                                              day=response.get('day'))

    rdata = {'comic_id': response.get('num'),
             'alt': response.get('alt'),
             'source_file_location': response.get('img'),
             'saved_file_location': self.web.download(response.get('img'), comic_filename)
                                            .replace(self.scraper.BASE_DATA_DIR + os.path.sep, ''),
             'posted_at': cutil.str_to_date(posted_at, formats=["%Y-%m-%d"]),
             'time_collected': cutil.get_datetime(),
             'title': response.get('title'),
             'transcript': response.get('transcript'),
             'raw_json': json.dumps(response),
             }

    return rdata
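# Hedged example of the response dict parse() above consumes; the keys match
# what the code reads and mirror the shape of xkcd's public info.0.json
# payload. All values here are illustrative, not real data.
response = {
    'num': 1234,
    'title': 'Example Title',
    'img': 'https://imgs.xkcd.com/comics/example.png',
    'alt': 'Example hover text',
    'year': '2013',
    'month': '4',
    'day': '30',
    'transcript': '',
}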
def parse(self, soup):
    """
    :return: Dict of the item's details
    """
    # The archive entry already includes the title
    rdata = self.web.scraper.archive_list.get(self.comic_id)

    # Parse the items here and return the content to be added to the db
    img_src = "http://www.questionablecontent.net" + soup.find('img', {'id': 'strip'})['src'][1:]
    news = soup.find('div', {'id': 'news'}).text.strip()

    comic_filename = '{last_num}/{comic_id}.png'\
        .format(last_num=str(self.comic_id)[-1],
                comic_id=self.comic_id)

    rdata.update({'comic_id': self.comic_id,
                  'news': news,
                  'file_path': self.web.download(img_src, comic_filename),
                  'time_collected': cutil.get_datetime(),
                  })

    return rdata
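# Hedged illustration of the src handling above, assuming the strip's src
# attribute is a relative path starting with '.' so that dropping the first
# character leaves a path that can be prefixed with the site host. The example
# path is made up.
src = './comics/1234.png'
img_src = 'http://www.questionablecontent.net' + src[1:]
print(img_src)  # http://www.questionablecontent.net/comics/1234.png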
def parse(self, soup):
    """
    :return: Dict of the item's details
    """
    rdata = self.scraper.archive_list.get(self.whatif_id)

    # Parse the items here and return the content to be added to the db
    article = self.web.driver.find_element_by_css_selector('article.entry')
    rdata['question'] = soup.find('article', {'class': 'entry'}).find('p', {'id': 'question'}).get_text()

    whatif_filename = '{base}/{last_num}/{whatif_id}.png'\
        .format(base=self.scraper.BASE_SAVE_DIR,
                last_num=str(self.whatif_id)[-1],
                whatif_id=self.whatif_id)

    rdata.update({'whatif_id': self.whatif_id,
                  'saved_file_location': self.web.screenshot(whatif_filename, element=article)
                                                 .replace(self.scraper.BASE_DATA_DIR + os.path.sep, ''),
                  'time_collected': cutil.get_datetime(),
                  })

    return rdata