Example #1
 def parse_forum(self, response):
     """Parse a Holiday Check forum page and build one CrawlerItem per post.

     Extracts post bodies, timestamps and author names with XPath, strips
     markup from each post, and skips the whole page (via DropItem) when a
     duplicate identifier is found in ``self.stored``.

     Returns a list of CrawlerItem objects (one per forum post).
     """
     hxs = HtmlXPathSelector(response)
     text = hxs.select('//div[@class="forumPostText"]').extract()
     time = hxs.select('//div[@class="forumPostTitle"]/div[@class="floatRight"]/text()').extract()
     author = hxs.select('//div[@class="forumUsername"]/a/text()').extract()
     items = []
     for i in range(len(text)):
         # Keep only the text nodes of the post HTML (strip all tags).
         texts = ''.join(BeautifulSoup(text[i]).findAll(text=True))
         if time[i]:
             thetime = time[i].strip()
             thetime = datetime.strptime(thetime, '%d.%m.%Y %H:%M:%S').strftime('%Y-%m-%d %H:%M:%S')
         else:
             # Fallback timestamp for posts without a visible date.
             thetime = '2012-10-14 20:24:02'
         # Compute the dedup key once instead of three times per iteration.
         identifier = getIdentifier(texts, thetime)
         if identifier in self.stored:
             raise DropItem("Duplicate item found")
         item = CrawlerItem()
         item['url'] = response.url
         item['texts'] = texts
         item['time'] = thetime
         item['author'] = author[i]
         item['source'] = 'forum'
         item['site'] = 'Holiday Check'
         item['location'] = 'Schweiz'
         item['crawltime'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
         # NOTE: 'identifer' (sic) is the field name used project-wide; kept as-is.
         item['identifer'] = identifier
         self.stored.append(identifier)
         items.append(item)
     # BUG FIX: return after the loop. The original returned inside the loop
     # body, so only the first post on each page was ever yielded.
     return items
Example #2
 def parse(self, response):
     """Parse a Tagesanzeiger article page and collect its comments.

     Skips pages whose discussion is disabled (marked by a ``noTalkback``
     div). For each comment it builds a CrawlerItem, deduplicated against
     ``self.stored`` via getIdentifier.

     Returns a list of new CrawlerItem objects, or None when the page has
     no discussion or no comment text.
     """
     hxs = HtmlXPathSelector(response)
     hasdiscus = hxs.select('//div[@class="noTalkback"]')
     if not hasdiscus:
         items = []
         text = hxs.select('//div[@class="komment"]/p/span').extract()
         user = hxs.select('//div[@class="kommentLeft"]/h4/text()').extract()
         time = hxs.select('//div[@class="kommentTime"]/text()').extract()
         if text:
             for i in range(len(text)):
                 # The last 12 characters of the time string are stripped
                 # before parsing the dd.mm.yyyy date portion.
                 dates = datetime.strptime(time[i][:-12], '%d.%m.%Y').strftime('%Y-%m-%d')
                 # Keep only the text nodes of the comment HTML.
                 texts = ''.join(BeautifulSoup(text[i]).findAll(text=True))
                 texts = texts.strip()
                 indeti = getIdentifier(texts, dates)
                 item = CrawlerItem()
                 item['url'] = response.url
                 # BUG FIX: store the cleaned text (as the other parsers do,
                 # and as the identifier is computed from), not the raw HTML.
                 item['texts'] = texts
                 item['time'] = dates
                 item['author'] = user[i]
                 item['source'] = 'Comments'
                 item['site'] = 'Tagesanzeiger'
                 item['location'] = 'Schweiz'
                 item['identifer'] = indeti
                 item['crawltime'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
                 if indeti not in self.stored:
                     self.stored.append(indeti)
                     items.append(item)
             return items
Example #3
 def parse_advisor(self, response):
     """Parse a Tagesanzeiger advisor article into a single CrawlerItem.

     Concatenates all paragraph texts of the article body (with inline
     anti-badword JavaScript and the publication date removed), dedupes
     against ``self.stored``, and raises DropItem on a duplicate.

     Returns one CrawlerItem, or None when the article body is empty.
     """
     hxs = HtmlXPathSelector(response)
     text = hxs.select('//div[@id="singleLeft"]/p').extract()
     date = hxs.select('//span[@class="publishedDate"]/text()').extract()
     dates = date[0].replace('(Erstellt: ', '')
     # The trailing 12 characters are stripped before parsing dd.mm.yyyy.
     dates = datetime.strptime(dates[:-12], '%d.%m.%Y').strftime('%Y-%m-%d')
     # Build the article body with join instead of quadratic ``s += ...``;
     # the unused ``items`` list from the original was removed.
     parts = []
     for fragment in text:
         texts = ''.join(BeautifulSoup(fragment).findAll(text=True))
         # Remove inline badword-filter JavaScript and the repeated date.
         texts = texts.replace('var badword = 0;', '')
         texts = texts.replace('var badwordserch = 1;', '')
         texts = texts.replace(date[0], '')
         parts.append(texts.strip())
     s = ''.join(parts)
     if s:
         indeti = getIdentifier(s, dates)
         if indeti in self.stored:
             raise DropItem("Duplicate item found")
         item = CrawlerItem()
         item['url'] = response.url
         item['texts'] = s
         item['time'] = dates
         item['author'] = 'Tagi'
         item['source'] = 'Zeitung'
         item['site'] = 'Tagesanzeiger'
         item['location'] = 'Schweiz'
         # NOTE: 'identifer' (sic) is the field name used project-wide.
         item['identifer'] = indeti
         item['crawltime'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
         self.stored.append(indeti)
         return item