def Crawl_Sport_Title(response):
    """Save the sport headlines listed ahead of the last stored link.

    Extracts the anchor texts and hrefs found under ``div.dd_bt`` and walks
    them in extraction order.  The walk stops at the first entry whose href,
    prefixed with ``http://www.chinanews.com``, equals the value returned by
    ``DataOptionGet().get_Last_Link_Sport()``.  Everything collected before
    that point is passed to ``DataOptionSave().Sport_Title_Save(links,
    titles)``; the links are passed as extracted, without the prefix.
    Nothing is saved when no entry was collected.
    """
    new_titles = []
    new_links = []

    item = DmozItem()
    selector = HtmlXPathSelector(response)
    item['title'] = selector.select(
        "//div[@class='dd_bt']//a/text()").extract()
    item['link'] = selector.select(
        "//div[@class='dd_bt']//a/@href").extract()

    latest_link = DataOptionGet().get_Last_Link_Sport()

    # Titles and links come from two separate extractions and are paired by
    # position.  NOTE(review): this assumes the lists line up one-to-one; an
    # IndexError is raised if there are fewer links than titles.
    for idx, headline in enumerate(item['title']):
        href = item['link'][idx]
        candidate = "http://www.chinanews.com" + str(href)
        if candidate == latest_link:
            break
        new_titles.append(headline)
        new_links.append(href)

    if not new_links:
        return

    DataOptionSave().Sport_Title_Save(new_links, new_titles)
def Crawl_Entertainment_Title(response):
    """Save the entertainment headlines listed ahead of the last stored link.

    Extracts the anchor texts and hrefs found in ``h2`` elements under
    ``div`` nodes whose class is exactly ``news-item img-news-item`` and
    walks them in extraction order.  The walk stops at the first entry whose
    href equals the value returned by ``DataOptionGet().get_Last_Link_ET()``.
    Everything collected before that point is passed to
    ``DataOptionSave().Entertainment_Title_Save(links, titles)``.  Nothing is
    saved when no entry was collected.
    """
    new_titles = []
    new_links = []

    item = DmozItem()
    selector = HtmlXPathSelector(response)
    item['title'] = selector.select(
        "//div[@class=\"news-item img-news-item\"]//h2//a/text()").extract()
    item['link'] = selector.select(
        "//div[@class=\"news-item img-news-item\"]//h2//a/@href").extract()

    latest_link = DataOptionGet().get_Last_Link_ET()

    # Titles and links come from two separate extractions and are paired by
    # position.  NOTE(review): this assumes the lists line up one-to-one; an
    # IndexError is raised if there are fewer links than titles.
    for idx, headline in enumerate(item['title']):
        href = item['link'][idx]
        if str(href) == latest_link:
            break
        new_titles.append(headline)
        new_links.append(href)

    if not new_links:
        return

    DataOptionSave().Entertainment_Title_Save(new_links, new_titles)
def Crawl_Community_Title(response):
    """Save the community headlines listed ahead of the last stored link.

    Extracts the anchor texts and hrefs found in list items under
    ``div.newsList`` and walks them in extraction order.  The walk stops at
    the first entry whose href equals the value returned by
    ``DataOptionGet().get_Last_Link_CO()``.  Everything collected before that
    point is passed to ``DataOptionSave().Comu_Title_Save(links, titles)``.
    Nothing is saved when no entry was collected.
    """
    new_titles = []
    new_links = []

    item = DmozItem()
    selector = HtmlXPathSelector(response)
    item['title'] = selector.select(
        "//div[@class=\"newsList\"]//ul//li//a/text()").extract()
    item['link'] = selector.select(
        "//div[@class=\"newsList\"]//ul//li//a/@href").extract()

    latest_link = DataOptionGet().get_Last_Link_CO()

    # Titles and links come from two separate extractions and are paired by
    # position.  NOTE(review): this assumes the lists line up one-to-one; an
    # IndexError is raised if there are fewer links than titles.
    for idx, headline in enumerate(item['title']):
        href = item['link'][idx]
        if str(href) == latest_link:
            break
        new_titles.append(headline)
        new_links.append(href)

    if not new_links:
        return

    DataOptionSave().Comu_Title_Save(new_links, new_titles)
def Crawl_Inter_Title(response):
    """Save the international headlines listed ahead of the last stored link.

    Extracts the text and href of every ``<a target="_blank"
    class="linkto">`` on the page and walks them in extraction order.  The
    walk stops at the first entry whose href, prefixed with
    ``http://news.qq.com``, equals the value returned by
    ``DataOptionGet().get_Last_Link_IN()``.  Everything collected before that
    point is passed to ``DataOptionSave().Inter_Title_Save``; the links are
    passed as extracted, without the prefix.  Nothing is saved when no entry
    was collected, including when the page contains no matching anchors.
    """
    hxs = HtmlXPathSelector(response)
    latest_link = DataOptionGet().get_Last_Link_IN()

    # Both XPaths are absolute, so they are evaluated against the whole
    # document.  Querying the page selector directly (rather than from inside
    # a loop over every <a>) yields the same lists and also yields empty
    # lists, instead of an unset-field lookup error, on a page with no
    # anchors at all.
    titles = hxs.select(
        "//a[@target='_blank' and @class='linkto']/text()").extract()
    links = hxs.select(
        "//a[@target='_blank' and @class='linkto']/@href").extract()

    new_titles = []
    new_links = []

    # Titles and links come from two separate extractions and are paired by
    # position.  NOTE(review): this assumes the lists line up one-to-one; an
    # IndexError is raised if there are fewer links than titles.
    for idx, headline in enumerate(titles):
        href = links[idx]
        if "http://news.qq.com" + str(href) == latest_link:
            break
        new_titles.append(headline)
        new_links.append(href)

    if not new_links:
        return

    # NOTE(review): the lists are stringified before saving, unlike the
    # sport/entertainment/community crawlers which pass the lists themselves.
    # Kept as-is because Inter_Title_Save's expected argument types are not
    # visible here -- confirm against DataOptionSave.
    DataOptionSave().Inter_Title_Save(str(new_links), str(new_titles))