def parse(site, website_timstamp, response):
    """Extract article summaries from a blog list page.

    The page is parsed with ``etree.HTML`` and one ``ArticleListItem`` is built
    for every ``<div class="entryContainer">`` found. Collection stops early as
    soon as an entry's creation time is not newer than ``website_timstamp``.

    Args:
        site: Value stored under the ``'site'`` key of every item.
        website_timstamp: Timestamp compared against
            ``Date.str_to_timestamp(created)`` of each entry (presumably the
            newest post time recorded by the previous crawl -- confirm with
            the caller).
        response: Markup of the list page, passed to ``etree.HTML``.

    Returns:
        A list of ``ArticleListItem`` objects in document order. When the
        early stop triggers, the entry that triggered it is still included as
        the last element.

    Raises:
        IndexError: If one of the document-wide queries yields fewer matches
            than there are entries, or the meta ``<span>`` text contains
            no ``'|'`` separator.
    """
    page = etree.HTML(response)
    entries = page.xpath('//div[@class="entryContainer"]')

    # Every expression below starts with "//", so it is evaluated against the
    # whole document regardless of the element it is called on. Run each one
    # once here instead of once per entry; the i-th match is paired with the
    # i-th entry by position.
    # NOTE(review): positional pairing misaligns if any entry lacks one of
    # these elements. Per-entry relative queries (".//...") were presumably
    # intended, but the page markup is not visible here, so the pairing is
    # kept as it was.
    links = page.xpath('//div[@id="entry-title"]//h1[@class="entryTitle"]//a')
    images = page.xpath('//div[@class="entryMeta"]//img')
    meta_spans = page.xpath('//div[@class="entryMeta"]//span')
    descriptions = page.xpath('//div[@class="blog_description"]')

    articles = []
    # Only the number of entries matters; the data comes from the lists above.
    # Elements are indexed lazily so a short list only raises if the loop
    # actually reaches the missing position.
    for i in range(len(entries)):
        # Single-argument print(...) behaves the same as a print statement
        # and as the print function.
        print(i)
        _item = ArticleListItem()

        link = links[i]
        title = link.text
        url = link.attrib['href']
        head_img = images[i].attrib['src']

        # The span text is split on '|': first part author, second part time.
        author_time = meta_spans[i].text.split('|')
        author = author_time[0].strip()
        created = author_time[1].strip()
        abstract = descriptions[i].text.strip()

        _item['title'] = title
        _item['author'] = author
        _item['headImg'] = head_img
        _item['abstract'] = abstract
        _item['url'] = url
        _item['site'] = site
        _item['isContentDownload'] = False
        _item['created'] = created
        articles.append(_item)

        cur = Date.str_to_timestamp(created)
        # Compare the post time of the entry just scraped with the site's
        # latest post time from the previous crawl; once an entry is not
        # newer, stop and return what has been collected.
        if website_timstamp >= cur:
            print(articles)
            print('%s %s %s %s' % (
                '=================================cur',
                Date.timestamp_to_str(cur),
                'website_timestamp',
                Date.timestamp_to_str(website_timstamp),
            ))
            return articles

    print('========================================')
    print(articles)
    print('========================================')
    return articles
def get_site_timestamp(self, site):
    """Return the stored update timestamp for the site with the given URL.

    Queries ``self.collection.find`` for documents whose ``"url"`` equals
    ``site`` (presumably a MongoDB-style collection -- confirm) and converts
    the ``'updated'`` field of the first match with
    ``Date.str_to_timestamp``.

    Args:
        site: URL value to look up under the ``"url"`` key.

    Returns:
        The converted timestamp of the first matching document, or ``None``
        when no document matches.
    """
    # A distinct loop name keeps the ``site`` argument from being rebound.
    for record in self.collection.find({"url": site}):
        # Only the first matching document is consulted.
        return Date.str_to_timestamp(record['updated'])
    return None