def parse(self, response):
    """Yield one ``PodsearchbotItem`` for every feed listed in the response.

    The raw response body is handed to ``listparser.parse``; each entry of
    the result's ``feeds`` collection contributes its ``url`` as the item's
    ``link`` field.
    """
    parsed = listparser.parse(response.body)
    for entry in parsed.feeds:
        found = PodsearchbotItem()
        found['link'] = entry.url
        yield found
def parse_podcast_page(self, response):
    """Yield the feed link found in the page's ``entry`` table, if any.

    The XPath collects the ``href`` of the first anchor of each cell in the
    first row of every ``<table class="entry">``.  The *second* extracted
    href (index 1) is used as the item's ``link``.  When fewer than two
    hrefs are found, nothing is yielded.
    """
    hxs = HtmlXPathSelector(response)
    podcast_url_xpath = "//table[@class='entry']//tr[1]/td/a[1]/@href"
    podcast_link = hxs.select(podcast_url_xpath).extract()

    # Use the builtin IndexError rather than ``exceptions.IndexError``: the
    # ``exceptions`` module exists only on Python 2, where it exposes this
    # very same class, so the builtin works on both major versions.
    try:
        feed_url = podcast_link[1]
    except IndexError:
        # Page does not contain the expected second link.
        return

    item = PodsearchbotItem()
    item['link'] = feed_url
    yield item
def parse(self, response):
    """Yield a ``PodsearchbotItem`` for each second-level outline ``url``.

    Every ``url`` attribute of an ``<outline>`` element nested directly in
    another ``<outline>`` is extracted.  Values beginning with ``/`` are
    prefixed with ``self._baseUrl`` before being stored in the item's
    ``link`` field.
    """
    # The fully anchored form "/opml/body/outline/outline/@url" was tried
    # and left disabled in favour of this looser query.
    outline_url_query = "//outline/outline/@url"
    selector = HtmlXPathSelector(response)

    for url in selector.select(outline_url_query).extract():
        if url.startswith('/'):
            # Site-relative reference: turn it into an absolute URL.
            url = self._baseUrl + url
        entry = PodsearchbotItem()
        entry['link'] = url
        yield entry
def parse_podcast_page(self, response):
    """Yield the feed link taken from the third "chicklets" list entry.

    Nothing is yielded when the XPath matches no ``href`` at all, or when
    the first matched ``href`` is the placeholder ``"#"``.
    """
    # Absolute path to the anchor inside the third <li> of the chicklets list.
    feed_href_query = (
        "/html/body/div[@class='container']/div[@id='column']"
        "/div[@id='podcast']/div[@id='podcast_details']"
        "/div[@class='konafilter']"
        "/div[@class='pf_box_header right nomobile']"
        "/ul[@class='chicklets nomobile']/li[3]/a/@href"
    )
    hrefs = HtmlXPathSelector(response).select(feed_href_query).extract()

    # Skip pages with no match or with a dummy "#" anchor.
    if not hrefs or hrefs[0] == "#":
        return

    result = PodsearchbotItem()
    result['link'] = hrefs[0]
    yield result
def parse_podcast_page(self, response):
    """Yield the feed link taken from the page's ``teasertitle`` anchor.

    The first ``href`` matched by the XPath is used.  A value beginning with
    ``/`` is prefixed with ``self._baseUrl``.  If the resulting URL starts
    with ``self._baseUrl + '/podcast_url'`` it is passed through
    ``self.getContentLocation`` (presumably resolving the real feed
    address -- confirm against that helper); should that call raise
    ``KeyError`` the unresolved URL is kept.

    Nothing is yielded when the XPath matches no ``href``.
    """
    hxs = HtmlXPathSelector(response)
    podcast_url_xpath = (
        "/html/body/div[@class='container_20']/div[@id='teasertitle']"
        "/div[@class='teasertitle']/a/@href"
    )
    matches = hxs.select(podcast_url_xpath).extract()
    if not matches:
        # No teaser link on this page; previously this crashed with an
        # unhandled IndexError on ``extract()[0]``.
        return

    link = matches[0]
    if link.startswith('/'):
        link = self._baseUrl + link
    if link.startswith(self._baseUrl + '/podcast_url'):
        # Builtin KeyError instead of the Python 2-only
        # ``exceptions.KeyError`` (same class on Python 2).
        try:
            link = self.getContentLocation(link)
        except KeyError:
            # Broken link: deliberately fall through and keep the
            # unresolved URL rather than dropping the item.
            pass

    item = PodsearchbotItem()
    item['link'] = link
    yield item
def parse_podcast_page(self, response):
    """Yield the feed link taken from the page's ``feed_head`` heading.

    The first ``href`` matched by the XPath is used.  A value beginning with
    ``/`` is prefixed with ``self._baseUrl``.  If the resulting URL starts
    with ``self._baseUrl + '/feed_url'`` it is passed through
    ``self.getContentLocation`` (presumably resolving the real feed
    address -- confirm against that helper); should that call raise
    ``KeyError`` the unresolved URL is kept.

    Nothing is yielded when the XPath matches no ``href``.
    """
    hxs = HtmlXPathSelector(response)
    podcast_url_xpath = (
        "/html/body/div[@id='page']/div[@id='content_home']"
        "/div[@id='content_podcast_col']"
        "/table/tbody/tr/td/table/tbody/tr"
        "/td[@class='feed_headbox'][1]/h1[@class='feed_head']/a/@href"
    )
    matches = hxs.select(podcast_url_xpath).extract()
    if not matches:
        # No link on this page.  The previous code swallowed the IndexError
        # and then used the never-assigned ``link``, which raised
        # UnboundLocalError; stop cleanly instead.
        return

    link = matches[0]
    if link.startswith('/'):
        link = self._baseUrl + link
    if link.startswith(self._baseUrl + '/feed_url'):
        # Builtin KeyError instead of the Python 2-only
        # ``exceptions.KeyError`` (same class on Python 2).
        try:
            link = self.getContentLocation(link)
        except KeyError:
            # Broken link: deliberately fall through and keep the
            # unresolved URL rather than dropping the item.
            pass

    item = PodsearchbotItem()
    item['link'] = link
    yield item
def parse_podcast_page(self, response):
    """Yield the feed link found on a podcast page, trying three locations.

    The candidate XPaths are evaluated in order and each usable match
    overwrites the previous one, so the last usable candidate wins.  A match
    is ignored when it starts with ``/community/map;show=`` or
    ``http://podster.de/view/``.  If no candidate produced a link, a warning
    is printed and nothing is yielded.
    """
    # Ordered from lowest to highest precedence (later assignments win).
    candidate_queries = (
        "//div[@id='content']//a[5]/@href",
        "//div[@id='content']//a[4]/@href",
        "//div[@id='content']//div[@class='boxcontent']/a[2]/@href",
    )
    # Hrefs with these prefixes are rejected as feed links.
    rejected_prefixes = ('/community/map;show=', 'http://podster.de/view/')

    selector = HtmlXPathSelector(response)
    result = PodsearchbotItem()

    for query in candidate_queries:
        try:
            href = selector.select(query).extract()[0]
        except IndexError:
            # This location is absent on the page; try the next one.
            continue
        if not href.startswith(rejected_prefixes):
            result['link'] = href

    try:
        result['link']
    except KeyError:
        print('PodsterDe: WARNING: The page %s did not contain a link to a feed.' % response.url)
        return
    yield result