def crawl(self, pub_date):
    """Return the SMBC strip and its after-comic image for ``pub_date``.

    Uses the first entry returned by ``feed.for_date(pub_date)``. The main
    strip URL is read from the entry summary, while the second image is
    read from the ``#aftercomic`` element of the page the entry links to.

    Returns a two-element list of ``CrawlerImage`` objects (only the first
    one carries a title), or ``None`` when the feed has no entry for the
    date.
    """
    title_prefix = 'Saturday Morning Breakfast Cereal - '
    rss = self.parse_feed('http://www.smbc-comics.com/rss.php')
    for item in rss.for_date(pub_date):
        # Drop the comic name that prefixes the feed title.
        strip_title = item.title.replace(title_prefix, '')
        strip_url = item.summary.src('img[src*="/comics/"]')
        linked_page = self.parse_page(item.link)
        bonus_url = linked_page.src('#aftercomic img')
        strip = CrawlerImage(strip_url, strip_title)
        bonus = CrawlerImage(bonus_url)
        return [strip, bonus]
def crawl(self, pub_date):
    """Return the Dark Legacy Comics image for ``pub_date``.

    Uses the first entry returned by ``feed.for_date(pub_date)`` and reads
    the ``img.comic-image`` source from the page that entry links to. The
    entry title becomes the image title. Returns ``None`` when the feed has
    no entry for the date.
    """
    rss = self.parse_feed('http://www.darklegacycomics.com/feed.xml')
    for item in rss.for_date(pub_date):
        heading = item.title
        comic_page = self.parse_page(item.link)
        image_url = comic_page.src('img.comic-image')
        return CrawlerImage(image_url, heading)
def crawl(self, pub_date):
    """Return the current Red Meat strip if it was published on ``pub_date``.

    The "FreshMeat" page always shows a single strip, so the image URL is
    checked for the ``YYYY-MM-DD`` form of ``pub_date``. When the date is
    not part of the URL the strip belongs to another day and ``None`` is
    returned.
    """
    selector = '.comicStrip img'
    page = self.parse_page('http://www.redmeat.com/max-cannon/FreshMeat')
    image_url = page.src(selector)
    image_alt = page.alt(selector)
    date_stamp = pub_date.strftime('%Y-%m-%d')
    if date_stamp in image_url:
        return CrawlerImage(image_url, image_alt)
    return None
def crawl(self, pub_date):
    """Return the PHD Comics image for ``pub_date``.

    Uses the first entry returned by ``feed.for_date(pub_date)``. The image
    URL is the first ``img`` in the entry summary. Returns ``None`` when the
    feed has no entry for the date.
    """
    rss = self.parse_feed('http://www.phdcomics.com/gradfeed.php')
    for item in rss.for_date(pub_date):
        image_url = item.summary.src('img')
        # The image title is the text between the first two single quotes
        # of the feed title; an unquoted title raises IndexError.
        quoted_parts = item.title.split("'")
        return CrawlerImage(image_url, quoted_parts[1])
def crawl(self, pub_date):
    """Return every Gunshow comic image found on the page for ``pub_date``.

    The page URL is built from the ``YYYYMMDD`` form of ``pub_date``. All
    images whose source starts with the comics prefix are collected, and one
    untitled ``CrawlerImage`` is returned per URL, in page order.
    """
    day = pub_date.strftime('%Y%m%d')
    page = self.parse_page('http://www.gunshowcomic.com/d/%s.html' % (day,))
    found_urls = page.src(
        'img[src^="http://gunshowcomic.com/comics/"]', allow_multiple=True)
    images = []
    for image_url in found_urls:
        images.append(CrawlerImage(image_url))
    return images
def crawl(self, pub_date):
    """Return the Basic Instructions image for ``pub_date``.

    Uses the first entry returned by ``feed.for_date(pub_date)`` and picks
    the summary image whose source contains both ``/storage/`` and
    ``.gif``. Returns ``None`` when the feed has no entry for the date.
    """
    rss = self.parse_feed(
        'http://basicinstructions.net/basic-instructions/rss.xml')
    for item in rss.for_date(pub_date):
        image_url = item.summary.src('img[src*="/storage/"][src*=".gif"]')
        return CrawlerImage(image_url, item.title)
def crawl(self, pub_date):
    """Return the Liberty Meadows image for ``pub_date``.

    Uses the first entry returned by ``feed.for_date(pub_date)``, loads the
    linked page and takes the image whose source contains ``_thumb``.
    Returns ``None`` when the feed has no entry for the date.
    """
    rss = self.parse_feed(
        'http://www.creators.com/comics/liberty-meadows.rss')
    for item in rss.for_date(pub_date):
        strip_page = self.parse_page(item.link)
        thumb_url = strip_page.src('img[src*="_thumb"]')
        # Every "thumb" in the thumbnail URL is swapped for "image" to get
        # the URL that is handed to CrawlerImage.
        full_url = thumb_url.replace('thumb', 'image')
        return CrawlerImage(full_url)
def crawl(self, pub_date):
    """Return the Sheldon strip for ``pub_date``.

    Walks the entries returned by ``feed.for_date(pub_date)`` and uses the
    first one tagged ``Comic``; other entries are ignored. The image is the
    one under ``/strips/`` in the entry content. Returns ``None`` when no
    tagged entry exists for the date.
    """
    rss = self.parse_feed('http://cdn.sheldoncomics.com/rss.xml')
    for item in rss.for_date(pub_date):
        if 'Comic' in item.tags:
            strip_url = item.content0.src('img[src*="/strips/"]')
            return CrawlerImage(strip_url)
def crawl(self, pub_date):
    """Return the Penny Arcade comic published on ``pub_date``.

    The comic page URL is built from the ``YYYY/MM/DD`` form of
    ``pub_date``. The image inside ``#comicFrame`` supplies both the title
    (its ``alt`` text) and the image URL.
    """
    selector = '#comicFrame img'
    date_path = pub_date.strftime('%Y/%m/%d')
    page = self.parse_page('http://penny-arcade.com/comic/%s' % (date_path,))
    image_title = page.alt(selector)
    image_url = page.src(selector)
    return CrawlerImage(image_url, image_title)
def crawl(self, pub_date):
    """Return the Abstruse Goose strip for ``pub_date``.

    Uses the first entry returned by ``feed.for_date(pub_date)``. The strip
    image in the entry summary provides the URL and, through its ``title``
    attribute, the accompanying text; the entry title is the image title.
    Returns ``None`` when the feed has no entry for the date.
    """
    selector = 'img[src*="/strips/"]'
    atom = self.parse_feed('http://abstrusegoose.com/atomfeed.xml')
    for item in atom.for_date(pub_date):
        strip_url = item.summary.src(selector)
        strip_title = item.title
        hover_text = item.summary.title(selector)
        return CrawlerImage(strip_url, strip_title, hover_text)
def crawl(self, pub_date):
    """Return the Wumo strip published on ``pub_date``.

    The page URL is built from the ``YYYY/MM/DD`` form of ``pub_date``. The
    image URL comes from the page's ``image_src`` link element, and the
    title is the ``alt`` text of the ``img`` element with that exact source.
    """
    date_path = pub_date.strftime('%Y/%m/%d')
    page = self.parse_page('http://kindofnormal.com/wumo/%s' % date_path)
    image_url = page.href('link[rel="image_src"]')
    # Look up the <img> that shows this very URL to get its alt text.
    image_selector = 'img[src="%s"]' % image_url
    image_title = page.alt(image_selector)
    return CrawlerImage(image_url, image_title)
def crawl(self, pub_date):
    """Return the Darths & Droids episode for ``pub_date``.

    Walks the entries returned by ``feed.for_date(pub_date)`` and uses the
    first one whose title starts with ``Episode``; other entries are
    ignored. Returns ``None`` when no such entry exists for the date.
    """
    rss = self.parse_feed('http://darthsanddroids.net/rss.xml')
    for item in rss.for_date(pub_date):
        if not item.title.startswith('Episode'):
            continue
        image_url = item.summary.src('img')
        return CrawlerImage(image_url, item.title)
def crawl(self, pub_date):
    """Return the Unspeakable Vault strip for ``pub_date``.

    Uses the first entry returned by ``feed.for_date(pub_date)`` and takes
    the summary image whose source contains ``/tx_cenostripviewer/``.
    Returns ``None`` when the feed has no entry for the date.
    """
    feed_url = ('http://www.goominet.com/unspeakable-vault/'
                '?type=103&ecorss[clear_cache]=1')
    rss = self.parse_feed(feed_url)
    for item in rss.for_date(pub_date):
        image_url = item.summary.src('img[src*="/tx_cenostripviewer/"]')
        return CrawlerImage(image_url, item.title)
def crawl(self, pub_date):
    """Return the SSSS page for the first entry in the feed.

    ``pub_date`` is not consulted: the first entry of ``feed.all()`` is
    used. The image comes from ``img.comicnormal`` on the linked page, and
    ``SSSS page`` in the entry title is shortened to ``Page``. Returns
    ``None`` when the feed has no entries.
    """
    rss = self.parse_feed('http://sssscomic.com/ssss-feed.xml')
    for item in rss.all():
        comic_page = self.parse_page(item.link)
        image_url = comic_page.src('img.comicnormal')
        short_title = item.title.replace('SSSS page', 'Page')
        return CrawlerImage(image_url, short_title)
def crawl(self, pub_date):
    """Return the Axe Cop image for ``pub_date``.

    Walks the entries returned by ``feed.for_date(pub_date)`` and uses the
    first one whose summary contains an uploaded image. The ``-150x150``
    suffix is removed from that image's URL before it is handed to
    ``CrawlerImage``.

    Entries whose summary has no matching image are skipped instead of
    failing with ``AttributeError`` on ``None.replace``. Returns ``None``
    when no entry for the date carries an image.
    """
    feed = self.parse_feed('http://axecop.com/feed/')
    for entry in feed.for_date(pub_date):
        title = entry.title
        url = entry.summary.src('img[src*="/wp-content/uploads/"]')
        if url is None:
            # No image in this entry; try the next one for the date.
            continue
        url = url.replace('-150x150', '')
        return CrawlerImage(url, title)
def crawl(self, pub_date):
    """Return the strip from the qwantz feed for ``pub_date``.

    Uses the first entry returned by ``feed.for_date(pub_date)``. The
    ``/comics/`` image in the entry summary provides the URL and, through
    its ``title`` attribute, the accompanying text; the entry title is the
    image title. Returns ``None`` when the feed has no entry for the date.
    """
    selector = 'img[src*="/comics/"]'
    rss = self.parse_feed('http://www.rsspect.com/rss/qwantz.xml')
    for item in rss.for_date(pub_date):
        comic_url = item.summary.src(selector)
        comic_title = item.title
        hover_text = item.summary.title(selector)
        return CrawlerImage(comic_url, comic_title, hover_text)
def crawl(self, pub_date):
    """Return the Extra Life comic for ``pub_date``.

    Uses the first entry returned by ``feed.for_date(pub_date)`` and takes
    the ``/wp-content/`` image from the entry's ``content0``. Returns
    ``None`` when the feed has no entry for the date.
    """
    rss = self.parse_feed(
        'http://www.myextralife.com/category/comic/feed/')
    for item in rss.for_date(pub_date):
        image_url = item.content0.src('img[src*="/wp-content/"]')
        return CrawlerImage(image_url, item.title)
def crawl(self, pub_date):
    """Return the Blaster Nation comic for ``pub_date``.

    Uses the first entry returned by ``feed.for_date(pub_date)`` and reads
    ``img#cc-comic`` from the linked page. The ``Blaster Nation - `` prefix
    is removed from the entry title. Returns ``None`` when the feed has no
    entry for the date.
    """
    rss = self.parse_feed('http://www.blasternation.com/rss.php')
    for item in rss.for_date(pub_date):
        comic_page = self.parse_page(item.link)
        image_url = comic_page.src('img#cc-comic')
        bare_title = item.title.replace('Blaster Nation - ', '')
        return CrawlerImage(image_url, bare_title)
def crawl(self, pub_date):
    """Return the This Is Historic Times image for ``pub_date``.

    Uses the first entry returned by ``feed.for_date(pub_date)`` and reads
    the ``/wp-content/uploads/`` image from the linked page. Returns
    ``None`` when the feed has no entry for the date.
    """
    rss = self.parse_feed('http://thisishistorictimes.com/feed/')
    for item in rss.for_date(pub_date):
        post_page = self.parse_page(item.link)
        image_url = post_page.src('img[src*="/wp-content/uploads/"]')
        return CrawlerImage(image_url, item.title)
def crawl(self, pub_date):
    """Return the CRFH strip published on ``pub_date``.

    Both the page URL and the image selector are built from the
    ``YYYYMMDD`` form of ``pub_date``. Newline characters are removed from
    the image URL before it is handed to ``CrawlerImage``.
    """
    day = pub_date.strftime('%Y%m%d')
    page = self.parse_page('http://www.crfh.net/d2/%s.html' % (day,))
    raw_url = page.src('img[src*="crfh%s"]' % day)
    clean_url = raw_url.replace('\n', '')
    return CrawlerImage(clean_url)
def crawl(self, pub_date):
    """Return the Goblins comic for ``pub_date``.

    Walks the entries returned by ``feed.for_date(pub_date)`` and uses the
    first one tagged ``Comics``; other entries are ignored. Returns
    ``None`` when no tagged entry exists for the date.
    """
    rss = self.parse_feed('http://www.goblinscomic.com/feed/')
    for item in rss.for_date(pub_date):
        if 'Comics' in item.tags:
            comic_url = item.summary.src('img[src*="/comics/"]')
            return CrawlerImage(comic_url)
def crawl(self, pub_date):
    """Return the Don't Hit Save comic for ``pub_date``.

    Uses the first entry returned by ``feed.for_date(pub_date)``: the first
    ``img`` of the summary is the image and the entry title is its title.
    Returns ``None`` when the feed has no entry for the date.
    """
    rss = self.parse_feed(
        'http://www.donthitsave.com/donthitsavefeed.xml')
    for item in rss.for_date(pub_date):
        image_url = item.summary.src('img')
        return CrawlerImage(image_url, item.title)
def crawl(self, pub_date):
    """Return the Little Gamers comic for ``pub_date``.

    Uses the first entry returned by ``feed.for_date(pub_date)``: the first
    ``img`` of the summary is the image and the entry title is its title.
    Returns ``None`` when the feed has no entry for the date.
    """
    rss = self.parse_feed(
        'http://www.little-gamers.com/category/comic/feed')
    for item in rss.for_date(pub_date):
        image_url = item.summary.src('img')
        return CrawlerImage(image_url, item.title)
def crawl(self, pub_date):
    """Return the Teh Gladiators comic for ``pub_date``.

    Uses the first entry returned by ``feed.for_date(pub_date)`` and reads,
    from the linked page, the image whose ``alt`` text starts with
    ``Teh Gladiators Webcomic``. Returns ``None`` when the feed has no
    entry for the date.
    """
    rss = self.parse_feed('http://www.tehgladiators.com/rss.xml')
    for item in rss.for_date(pub_date):
        comic_page = self.parse_page(item.link)
        image_url = comic_page.src('img[alt^="Teh Gladiators Webcomic"]')
        return CrawlerImage(image_url, item.title)
def crawl(self, pub_date):
    """Return the comic for the first entry in the satwcomic feed.

    ``pub_date`` is not consulted: the first entry of ``feed.all()`` is
    used. The linked page supplies the image (``itemprop="image"``) and
    the text (the stripped ``articleBody`` span); the entry title is the
    image title. Returns ``None`` when the feed has no entries.
    """
    rss = self.parse_feed('http://feeds.feedburner.com/satwcomic')
    for item in rss.all():
        comic_page = self.parse_page(item.link)
        image_url = comic_page.src('img[itemprop="image"]')
        image_title = item.title
        body_text = comic_page.text('span[itemprop="articleBody"]')
        return CrawlerImage(image_url, image_title, body_text.strip())
def crawl(self, pub_date):
    """Return the MegaTokyo strip for ``pub_date``.

    Walks the entries returned by ``feed.for_date(pub_date)`` and uses the
    first one whose title starts with ``Comic [``; other entries are
    ignored. The strip image is read from the linked page. Returns
    ``None`` when no such entry exists for the date.
    """
    rss = self.parse_feed('http://www.megatokyo.com/rss/megatokyo.xml')
    for item in rss.for_date(pub_date):
        if not item.title.startswith('Comic ['):
            continue
        # The strip name is the text between the first two double quotes.
        strip_name = item.title.split('"')[1]
        strip_page = self.parse_page(item.link)
        strip_url = strip_page.src('img[src*="/strips/"]')
        return CrawlerImage(strip_url, strip_name)
def crawl(self, pub_date):
    """Return the Savage Chickens cartoon for ``pub_date``.

    Walks the entries returned by ``feed.for_date(pub_date)`` and uses the
    first one tagged ``Cartoons``. Entries without that tag are skipped, so
    a non-cartoon post on the same date is never returned as the comic.
    The image is the ``/wp-content/`` image in the entry's ``content0`` and
    the entry title is the image title.

    Returns ``None`` when no tagged entry exists for the date.
    """
    feed = self.parse_feed('http://www.savagechickens.com/feed')
    for entry in feed.for_date(pub_date):
        if 'Cartoons' not in entry.tags:
            # Actually skip the entry; a debug print used to stand here and
            # processing fell through to the untagged entry.
            continue
        url = entry.content0.src('img[src*="/wp-content/"]')
        title = entry.title
        return CrawlerImage(url, title)
def crawl(self, pub_date):
    """Return the Exocomics strip for ``pub_date``.

    Uses the first entry returned by ``feed.for_date(pub_date)``. The
    ``.comic img`` element on the linked page provides the URL and, through
    its ``title`` attribute, the accompanying text; the entry title is the
    image title. Returns ``None`` when the feed has no entry for the date.
    """
    selector = '.comic img'
    rss = self.parse_feed('http://www.exocomics.com/feed')
    for item in rss.for_date(pub_date):
        strip_title = item.title
        strip_page = self.parse_page(item.link)
        strip_url = strip_page.src(selector)
        hover_text = strip_page.title(selector)
        return CrawlerImage(strip_url, strip_title, hover_text)
def crawl(self, pub_date):
    """Return the image for ``pub_date``, built from the date alone.

    No page or feed is fetched. On Sundays (``weekday() == 6``) the URL is
    a ``.jpg`` on the picayune.uclick.com host; on every other day it is a
    ``.gif`` on the images.ucomics.com host. Both use the same
    ``YYYY/gaYYMMDD`` path component.
    """
    is_sunday = pub_date.weekday() == 6
    if is_sunday:
        template = 'http://picayune.uclick.com/comics/ga/%s.jpg'
    else:
        template = 'http://images.ucomics.com/comics/ga/%s.gif'
    date_path = pub_date.strftime('%Y/ga%y%m%d')
    return CrawlerImage(template % (date_path,))
def crawl(self, pub_date):
    """Return the comic from thepunchlineismachismo.com for ``pub_date``.

    Walks the entries returned by ``feed.for_date(pub_date)`` and uses the
    first one whose summary contains an uploaded image; entries without
    one are passed over. The ``-150x150`` suffix is removed from the image
    URL. Returns ``None`` when no entry for the date carries an image.
    """
    rss = self.parse_feed('http://thepunchlineismachismo.com/feed')
    for item in rss.for_date(pub_date):
        thumb_url = item.summary.src('img[src*="/wp-content/uploads/"]')
        if thumb_url is None:
            continue
        full_url = thumb_url.replace('-150x150', '')
        return CrawlerImage(full_url, item.title)