# Imports assumed for these spider methods (not shown in the original excerpt):
from datetime import datetime

from bs4 import BeautifulSoup  # the original may use BeautifulSoup 3; findAll(text=True) works in both
from scrapy.selector import HtmlXPathSelector

# CrawlerItem and getIdentifier are defined elsewhere in this project.


def parse_forum(self, response):
    hxs = HtmlXPathSelector(response)
    text = hxs.select('//div[@class="forumPostText"]').extract()
    time = hxs.select('//div[@class="forumPostTitle"]/div[@class="floatRight"]/text()').extract()
    author = hxs.select('//div[@class="forumUsername"]/a/text()').extract()
    items = []
    for i in range(len(text)):
        # Strip the HTML markup, keeping only the text nodes of the post.
        texts = ''.join(BeautifulSoup(text[i]).findAll(text=True))
        if time[i]:
            thetime = time[i].strip()
            thetime = datetime.strptime(thetime, '%d.%m.%Y %H:%M:%S').strftime('%Y-%m-%d %H:%M:%S')
        else:
            # Fallback timestamp for posts without a parsable date.
            thetime = '2012-10-14 20:24:02'
        if getIdentifier(texts, thetime) in self.stored:
            # Skip duplicates instead of raising DropItem: DropItem is meant for
            # item pipelines; raising it here would abort the whole callback and
            # lose every post that follows the duplicate.
            continue
        item = CrawlerItem()
        item['url'] = response.url
        item['texts'] = texts
        item['time'] = thetime
        item['author'] = author[i]
        item['source'] = 'forum'
        item['site'] = 'Holiday Check'
        item['location'] = 'Schweiz'
        item['crawltime'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        item['identifer'] = getIdentifier(texts, thetime)  # field name as spelled in CrawlerItem
        self.stored.append(item['identifer'])
        items.append(item)
    return items
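# The duplicate check above could instead live in an item pipeline, which is
# where Scrapy expects DropItem to be raised. A minimal sketch, assuming a
# hypothetical DuplicatesPipeline class (the name and the settings wiring below
# are illustrative, not from the original project):

from scrapy.exceptions import DropItem


class DuplicatesPipeline(object):
    """Drop items whose 'identifer' has already been seen in this crawl."""

    def __init__(self):
        self.seen = set()

    def process_item(self, item, spider):
        if item['identifer'] in self.seen:
            raise DropItem("Duplicate item found: %s" % item['url'])
        self.seen.add(item['identifer'])
        return item

# Enabled via ITEM_PIPELINES in settings.py, e.g.:
# ITEM_PIPELINES = {'crawler.pipelines.DuplicatesPipeline': 300}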
def parse(self, response):
    hxs = HtmlXPathSelector(response)
    # Pages with a "noTalkback" div carry no comment section; skip them.
    hasdiscus = hxs.select('//div[@class="noTalkback"]')
    if not hasdiscus:
        items = []
        text = hxs.select('//div[@class="komment"]/p/span').extract()
        user = hxs.select('//div[@class="kommentLeft"]/h4/text()').extract()
        time = hxs.select('//div[@class="kommentTime"]/text()').extract()
        if text:
            for i in range(len(text)):
                # The last 12 characters of the timestamp hold the time of day;
                # only the date part is kept.
                dates = datetime.strptime(time[i][:-12], '%d.%m.%Y').strftime('%Y-%m-%d')
                texts = ''.join(BeautifulSoup(text[i]).findAll(text=True))
                texts = texts.strip()
                indeti = getIdentifier(texts, dates)
                if indeti in self.stored:
                    continue  # comment already seen in this crawl
                item = CrawlerItem()
                item['url'] = response.url
                item['texts'] = texts  # store the stripped text, matching the identifier
                item['time'] = dates
                item['author'] = user[i]
                item['source'] = 'Comments'
                item['site'] = 'Tagesanzeiger'
                item['location'] = 'Schweiz'
                item['identifer'] = indeti
                item['crawltime'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
                self.stored.append(indeti)
                items.append(item)
        return items
def parse_advisor(self, response):
    hxs = HtmlXPathSelector(response)
    text = hxs.select('//div[@id="singleLeft"]/p').extract()
    date = hxs.select('//span[@class="publishedDate"]/text()').extract()
    dates = date[0].replace('(Erstellt: ', '')
    dates = datetime.strptime(dates[:-12], '%d.%m.%Y').strftime('%Y-%m-%d')
    s = ""
    for i in range(len(text)):
        texts = ''.join(BeautifulSoup(text[i]).findAll(text=True))
        # Remove inline script fragments and the date string from the article body.
        texts = texts.replace('var badword = 0;', '')
        texts = texts.replace('var badwordserch = 1;', '')
        texts = texts.replace(date[0], '')
        texts = texts.strip()
        s += texts
    if s:
        indeti = getIdentifier(s, dates)
        if indeti in self.stored:
            # Duplicate article: return nothing rather than raising DropItem,
            # which belongs in an item pipeline.
            return None
        item = CrawlerItem()
        item['url'] = response.url
        item['texts'] = s
        item['time'] = dates
        item['author'] = 'Tagi'
        item['source'] = 'Zeitung'
        item['site'] = 'Tagesanzeiger'
        item['location'] = 'Schweiz'
        item['identifer'] = indeti
        item['crawltime'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        self.stored.append(indeti)
        return item
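# getIdentifier is not shown in the original excerpt. A minimal sketch of what
# it plausibly does, assuming it derives a stable hash from the cleaned text
# and its date; the project's actual implementation may differ:

import hashlib


def getIdentifier(texts, date):
    """Return a stable identifier for a post: MD5 over text plus date."""
    data = (texts + date).encode('utf-8')
    return hashlib.md5(data).hexdigest()

# Because the hash covers both text and date, a reposted comment with the same
# wording but a different date would still produce a distinct identifier.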