def start_requests(self):
    """Build the initial requests for the crawl.

    Every document stored in the ``api.quited_url`` collection of the local
    MongoDB instance is read, and the collection is then emptied. Each
    document is turned back into a ``Request`` according to its ``type``
    field. When the collection holds no documents, a single request for
    the API directory page is returned instead.

    NOTE(review): the collection presumably holds URLs that were still
    pending when an earlier run stopped -- confirm against the code that
    writes to it.

    Returns:
        list: the ``Request`` objects to schedule first.

    Raises:
        KeyError: if a stored document lacks one of the ``url``, ``type``,
            ``item`` or ``need_access`` fields.
    """
    # type -> (item class, item initialiser, callback method name,
    #          whether 'need_access' is forwarded in the request meta)
    handlers = {
        'api_page': (API, api_item_init, 'parse', False),
        'followers_page': (API_Followers, api_followers_init,
                           'api_followers_parse', True),
        'developers_page': (API_Developers, api_developers_init,
                            'api_developers_parse', True),
        'api_summary': (API, api_item_init, 'api_summary_parse', False),
        'user_page_f': (API_Followers, api_followers_init,
                        'user_exceed_parse', True),
        'user_page_d': (API_Developers, api_developers_init,
                        'user_exceed_parse', True),
    }

    client = MongoClient(host="localhost", port=27017)
    try:
        collection = client.api.quited_url
        # Materialise the cursor before wiping the collection.
        saved = list(collection.find())
        # NOTE: Collection.remove() is deprecated in newer PyMongo releases
        # in favour of delete_many({}); kept as-is because the installed
        # driver version is not visible from here.
        collection.remove()
    finally:
        # The client is only needed to load the saved documents.
        client.close()

    if not saved:
        # Nothing stored: start from the first page of the API directory.
        return [Request(url="http://www.programmableweb.com/apis/directory",
                        meta={'type': 'api_page'},
                        callback=self.parse,
                        dont_filter=True)]

    requests = []
    for record in saved:
        # All four fields are read up front so a malformed document fails
        # loudly regardless of its type.
        url = record["url"].encode("utf-8")
        page_type = record["type"]
        stored_fields = record["item"]
        need_access = record["need_access"]

        handler = handlers.get(page_type)
        if handler is None:
            # Documents with an unrecognised type produce no request.
            continue
        item_cls, init_item, callback_name, forwards_access = handler

        item = item_cls()
        init_item(item)
        if stored_fields is not None:
            item_filed_from_dict(item, stored_fields)

        meta = {'type': page_type, 'item': item}
        if forwards_access:
            meta['need_access'] = need_access

        requests.append(Request(url=url,
                                callback=getattr(self, callback_name),
                                meta=meta,
                                dont_filter=True))
    return requests
def parse(self, response):
    """Parse one page of the API directory listing.

    Yields one ``Request`` per API link found in the listing table (handled
    by ``api_summary_parse``), each carrying a freshly initialised ``API``
    item in its meta. If the page has a "Go to next page" link, a final
    ``Request`` for that page is yielded, handled by this method again.
    """
    title_href_xpath = '//td[@class="views-field views-field-title col-md-3"]/a/@href'
    created_cell_suffix = '"]/../following-sibling::td[@class="views-field views-field-created"]/text()'

    page = Selector(response)

    for href in page.xpath(title_href_xpath).extract():
        # Find the "created" cell that sits in the same row as this link.
        created_text = page.xpath('//td/a[@href="' + href + created_cell_suffix)

        entry = API()
        api_item_init(entry)
        entry['pweb_link'] = self.base_url + href.strip().encode("utf-8")
        item_filled(created_text, entry, 'updated_time', True)

        summary_meta = {'item': entry, 'type': 'api_summary'}
        yield Request(url=self.base_url + href,
                      callback=self.api_summary_parse,
                      meta=summary_meta,
                      dont_filter=True)

    pager_links = page.xpath('//a[@title="Go to next page"]/@href')
    if not pager_links:
        # Last page of the directory: nothing further to schedule.
        return

    following_page = pager_links.extract()[0]
    yield Request(url=self.base_url + following_page,
                  callback=self.parse,
                  meta={'type': 'api_page'},
                  dont_filter=True)