Example #1
0
 def parse(self, response):
     """Collect ETtoday Travel article links and schedule them for crawling.

     Yields a scrapy.Request (handled by EttodayTravelSpider.parse) for
     every <a href> matching the article-URL pattern that is not already
     recorded in the database.
     """
     hrefs = response.xpath('//a/@href').extract()
     ettoday_travel = EttodayTravelSpider()
     # Raw string: "\d" / "\." in a plain literal are invalid escape
     # sequences (SyntaxWarning on Python 3.12+).
     r = re.compile(r"^http://travel\.ettoday\.net/article/\d+\.htm$")
     for href in hrefs:
         if r.match(href) and not Database.find_dup(href):
             yield scrapy.Request(href, callback=ettoday_travel.parse)
Example #2
0
 def parse(self, response):
     """Collect Letsgojp archive links and schedule them for crawling.

     Yields a scrapy.Request (handled by LetsgojpSpider.parse) for every
     <a href> matching the archive-URL pattern that is not already in
     the database.
     """
     hrefs = response.xpath('//a/@href').extract()
     letsgojp = LetsgojpSpider()
     # Raw string avoids invalid escape sequences (SyntaxWarning on 3.12+).
     r = re.compile(r"http://.*letsgojp\.com/archives/\d+/$")
     for href in hrefs:
         if r.match(href) and not Database.find_dup(href):
             yield scrapy.Request(href, callback=letsgojp.parse)
Example #3
0
 def parse(self, response):
     """Collect beauties.life post links and schedule them for crawling.

     Yields a scrapy.Request (handled by BeautiesSpider.parse) for every
     <a href> matching the post-URL pattern that is not already in the
     database.
     """
     hrefs = response.xpath('//a/@href').extract()
     beauties = BeautiesSpider()
     # Raw string avoids invalid escape sequences (SyntaxWarning on 3.12+).
     r = re.compile(r"^http://beauties\.life/\?p=\d+$")
     for href in hrefs:
         if r.match(href) and not Database.find_dup(href):
             yield scrapy.Request(href, callback=beauties.parse)
Example #4
0
 def parse(self, response):
     """Collect ETtoday news links and schedule them for crawling.

     Yields a scrapy.Request (handled by EttodaySpider.parse) for every
     <a href> matching the news-URL pattern that is not already in the
     database.
     """
     hrefs = response.xpath('//a/@href').extract()
     ettoday = EttodaySpider()
     # Raw string avoids invalid escape sequences (SyntaxWarning on 3.12+).
     r = re.compile(r"^http://www\.ettoday\.net/news/\d+/\d+\.htm$")
     for href in hrefs:
         if r.match(href) and not Database.find_dup(href):
             yield scrapy.Request(href, callback=ettoday.parse)
Example #5
0
 def parse(self, response):
     """Collect news.qq.com article links and schedule them for crawling.

     Yields a scrapy.Request (handled by QqSpider.parse) for every
     <a href> matching the article-URL pattern that is not already in
     the database.
     """
     hrefs = response.xpath('//a/@href').extract()
     qq = QqSpider()
     # Raw string avoids invalid escape sequences (SyntaxWarning on 3.12+).
     r = re.compile(r"^http://news\.qq\.com/a/\d+/\d+\.htm$")
     for href in hrefs:
         if r.match(href) and not Database.find_dup(href):
             yield scrapy.Request(href, callback=qq.parse)
Example #6
0
 def parse(self, response):
     """Queue every not-yet-seen KKday post link found on the listing page."""
     kkday = KKdaySpider()
     post_links = response.xpath(
         '//div[contains(@class, "post")]/h3[contains(@class, "post-title")]/a/@href'
     ).extract()
     for link in post_links:
         if Database.find_dup(link):
             continue
         yield scrapy.Request(link, callback=kkday.parse)
Example #7
0
 def parse(self, response):
     """Queue Storm articles linked from the main-content area, skipping dups."""
     storm = StormSpider()
     paths = response.xpath(
         '//div[@class="main_content"]/a/@href').extract()
     for path in paths:
         if not path:
             continue
         url = "http://www.storm.mg" + path
         if not Database.find_dup(url):
             yield scrapy.Request(url, callback=storm.parse)
Example #8
0
 def parse(self, response):
     """Collect Chinatimes article paths and schedule the full URLs.

     Relative paths matching the realtimenews/newspapers pattern are
     prefixed with the site host; unseen URLs are yielded as
     scrapy.Request objects handled by ChinatimesSpider.parse.
     """
     hrefs = response.xpath('//a/@href').extract()
     chinatimes = ChinatimesSpider()
     # Raw string avoids invalid escape sequences (SyntaxWarning on 3.12+).
     r = re.compile(r"/(realtimenews|newspapers)/\d+-\d+$")
     for _href in hrefs:
         if r.match(_href):
             href = "http://www.chinatimes.com" + _href
             if not Database.find_dup(href):
                 yield scrapy.Request(href, callback=chinatimes.parse)
Example #9
0
 def parse(self, response):
     """Collect Juksy archive paths and schedule the full URLs.

     Relative "/archives/<id>" paths are prefixed with the site host;
     unseen URLs are yielded as scrapy.Request objects handled by
     JuksySpider.parse.
     """
     hrefs = response.xpath('//a/@href').extract()
     juksy = JuksySpider()
     # Raw string avoids invalid escape sequences (SyntaxWarning on 3.12+).
     r = re.compile(r"^/archives/\d+$")
     for _href in hrefs:
         if r.match(_href):
             href = "https://www.juksy.com" + _href
             if not Database.find_dup(href):
                 yield scrapy.Request(href, callback=juksy.parse)
Example #10
0
 def parse(self, response):
     """Collect SETN news paths and schedule the full URLs.

     Relative "/News.aspx?NewsID=<id>" paths are prefixed with the site
     host; unseen URLs are yielded as scrapy.Request objects handled by
     SetnSpider.parse.
     """
     hrefs = response.xpath('//a/@href').extract()
     setn = SetnSpider()
     # Raw string avoids invalid escape sequences (SyntaxWarning on 3.12+).
     r = re.compile(r"^/News\.aspx\?NewsID=\d+$")
     for _href in hrefs:
         if r.match(_href):
             href = "http://www.setn.com" + _href
             if not Database.find_dup(href):
                 yield scrapy.Request(href, callback=setn.parse)
Example #11
0
 def parse(self, response):
     """Collect Bomb01 article paths and schedule the full URLs.

     Relative "/article/<id>" paths are prefixed with the site host;
     unseen URLs are yielded as scrapy.Request objects handled by
     Bomb01Spider.parse.
     """
     hrefs = response.xpath('//a/@href').extract()
     bomb01 = Bomb01Spider()
     # Raw string avoids invalid escape sequences (SyntaxWarning on 3.12+).
     r = re.compile(r"^/article/\d+$")
     for _href in hrefs:
         if r.match(_href):
             href = "https://www.bomb01.com" + _href
             if not Database.find_dup(href):
                 yield scrapy.Request(href, callback=bomb01.parse)
Example #12
0
 def parse(self, response):
     """Collect ETtoday Sports news paths and schedule the full URLs.

     Relative "/news/<id>" paths are prefixed with the site host;
     unseen URLs are yielded as scrapy.Request objects handled by
     EttodaySportsSpider.parse.
     """
     hrefs = response.xpath('//a/@href').extract()
     ettoday_sports = EttodaySportsSpider()
     # Raw string avoids invalid escape sequences (SyntaxWarning on 3.12+).
     r = re.compile(r"^/news/\d+$")
     for _href in hrefs:
         if r.match(_href):
             href = "http://sports.ettoday.net" + _href
             if not Database.find_dup(href):
                 yield scrapy.Request(href, callback=ettoday_sports.parse)
Example #13
0
 def parse(self, response):
     """Collect CDNews document paths and schedule the full URLs.

     Relative "docDetail.jsp?..." paths are prefixed with the site base;
     unseen URLs are yielded as scrapy.Request objects handled by
     CdnewsSpider.parse.
     """
     hrefs = response.xpath('//a/@href').extract()
     cdnews = CdnewsSpider()
     # Raw string avoids invalid escape sequences (SyntaxWarning on 3.12+).
     r = re.compile(r"^docDetail\.jsp\?coluid=\d+&docid=\d+$")
     for _href in hrefs:
         if r.match(_href):
             href = "http://www.cdnews.com.tw/cdnews_site/" + _href
             if not Database.find_dup(href):
                 yield scrapy.Request(href, callback=cdnews.parse)
Example #14
0
 def parse(self, response):
     """Queue Apple Daily realtime-list articles that are not yet in the DB."""
     appledaily = AppledailySpider()
     paths = response.xpath(
         '//div[contains(@class, "abdominis")]/ul/li[contains(@class, "rtddt")]/a/@href'
     ).extract()
     for path in paths:
         url = "http://www.appledaily.com.tw" + path
         if Database.find_dup(url):
             continue
         yield scrapy.Request(url, callback=appledaily.parse)
Example #15
0
 def parse(self, response):
     """Collect UDN Style story paths and schedule the full URLs.

     Relative "/style/story/<sec>/<id>" paths are prefixed with the site
     host; unseen URLs are yielded as scrapy.Request objects handled by
     UdnStyleSpider.parse.
     """
     hrefs = response.xpath('//a/@href').extract()
     udn_style = UdnStyleSpider()
     # Raw string avoids invalid escape sequences (SyntaxWarning on 3.12+).
     r = re.compile(r"^/style/story/\d+/\d+$")
     for _href in hrefs:
         if r.match(_href):
             href = "http://style.udn.com" + _href
             if not Database.find_dup(href):
                 yield scrapy.Request(href, callback=udn_style.parse)
Example #16
0
 def parse(self, response):
     """Collect Mobile01 news paths and schedule the full URLs.

     Relative "newsdetail/<id>/..." paths are prefixed with the site
     host; unseen URLs are yielded as scrapy.Request objects handled by
     Mobile01Spider.parse.
     """
     hrefs = response.xpath('//a/@href').extract()
     mobile01 = Mobile01Spider()
     # Raw string avoids invalid escape sequences (SyntaxWarning on 3.12+).
     r = re.compile(r"^newsdetail/\d+/.*$")
     for _href in hrefs:
         if r.match(_href):
             href = "http://www.mobile01.com/" + _href
             if not Database.find_dup(href):
                 yield scrapy.Request(href, callback=mobile01.parse)
Example #17
0
 def parse(self, response):
     """Collect UDN Health story paths and schedule the full URLs.

     Relative "/health/story/<sec>/<id>" paths are prefixed with the
     site host; unseen URLs are yielded as scrapy.Request objects
     handled by UdnHealthSpider.parse.
     """
     hrefs = response.xpath('//a/@href').extract()
     udn_health = UdnHealthSpider()
     # Raw string avoids invalid escape sequences (SyntaxWarning on 3.12+).
     r = re.compile(r"^/health/story/\d+/\d+$")
     for _href in hrefs:
         if r.match(_href):
             href = "http://health.udn.com" + _href
             if not Database.find_dup(href):
                 yield scrapy.Request(href, callback=udn_health.parse)
Example #18
0
 def parse(self, response):
     """Collect Stheadline news links (absolute or relative) and schedule them.

     Accepts either the absolute article URL or the bare
     "news-content.php?..." path; relative paths are prefixed with the
     site base before the duplicate check. Unseen URLs are yielded as
     scrapy.Request objects handled by StheadlineSpider.parse.
     """
     hrefs = response.xpath('//a/@href').extract()
     stheadline = StheadlineSpider()
     # Raw string avoids invalid escape sequences (SyntaxWarning on 3.12+);
     # dots in the host/path prefix are now escaped so "." no longer
     # matches arbitrary characters.
     r = re.compile(r"^(http://std\.stheadline\.com/daily/)?news-content\.php\?id=\d+&target=\d+$")
     for _href in hrefs:
         if r.match(_href):
             href = _href
             if not href.startswith("http"):
                 href = "http://std.stheadline.com/daily/" + href
             if not Database.find_dup(href):
                 yield scrapy.Request(href, callback=stheadline.parse)