def parse(self, response):
    """Parse a Baidu market page.

    Detail pages (URLs ending in ``.html``) yield one ``ApkInfoItem``;
    every same-domain link on the page is scheduled for further crawling.
    """
    if response.url.endswith('.html'):
        loader = ApkInfoLoader(item=ApkInfoItem(), response=response)
        loader.add_value('task_id', settings['SPIDER_TASK'].get('baidu', 0))
        loader.add_value('app_refer_url', response.url)
        # Field name -> XPath on the Baidu detail page.
        field_xpaths = {
            # download count
            'app_dl_count': '//span[@class="download-num"]/text()',
            # app category
            'app_category': '//div[@class="nav"]/span[5]/a[@target="_self"]/text()',
            # app name
            'app_name': '//div[@class="area-one-setup"]/span/@data_name',
            # version
            'app_version': '//div[@class="area-one-setup"]/span/@data_versionname',
            # package name
            'app_package': '//div[@class="area-one-setup"]/span/@data_package',
            # download URL
            'app_dl_url': '//div[@class="area-one-setup"]/span/@data_url',
        }
        for field, expr in field_xpaths.items():
            loader.add_xpath(field, expr)
        yield loader.load_item()
    # Follow every same-domain link found on the page.
    links = Selector(response).xpath('//a/@href').extract()
    for url in normalize_and_dup(self.domain, links):
        if same_domain(url, self.domain):
            yield Request(url, callback=self.parse)
def parse(self, response):
    """Parse a PP market page.

    Pages whose URL contains ``detail_`` yield one ``ApkInfoItem``;
    only links containing ``android`` are considered for crawling.
    """
    selector = Selector(response)
    if 'detail_' in response.url:
        loader = ApkInfoLoader(item=ApkInfoItem(), response=response)
        loader.add_value('task_id', settings['SPIDER_TASK'].get('pp', 0))
        loader.add_value('app_refer_url', response.url)
        # Field name -> XPath on the PP detail page.
        field_xpaths = {
            # download count
            'app_dl_count': '//div[@class="app-downs"]/text()',
            # app name
            'app_name': '//div[@class="app-title ellipsis"]/text()',
            # app category
            'app_category': '//div[@class="crumb"]/a[2]/text()',
            # version
            'app_version': '//div[@class="app-detail-info"]/p[2]/span[1]/strong/text()',
            # package name
            'app_package': '//div[@class="detail-side"]/@data-stat-exp',
            # download URL
            'app_dl_url': '//a[@class="btn-install large-btn"]/@appdownurl',
        }
        for field, expr in field_xpaths.items():
            loader.add_xpath(field, expr)
        yield loader.load_item()
    links = selector.xpath('//a/@href').extract()
    android_links = [link for link in links if 'android' in link]
    for url in normalize_and_dup(self.domain, android_links):
        if same_domain(url, self.domain):
            yield Request(url, callback=self.parse)
def parse(self, response):
    """Parse an AppChina market page.

    App pages (``<domain>/app/...``) yield one ``ApkInfoItem``; every
    same-domain link on the page is scheduled for further crawling.
    """
    if response.url.startswith('{}/app/'.format(self.domain)):
        loader = ApkInfoLoader(item=ApkInfoItem(), response=response)
        loader.add_value('task_id', settings['SPIDER_TASK'].get('appchina', 0))
        loader.add_value('app_refer_url', response.url)
        # Field name -> XPath on the AppChina detail page.
        field_xpaths = {
            # download count
            'app_dl_count': '//span[@class="app-statistic"]/text()',
            # app category
            'app_category': '//div[@class="breadcrumb centre-content"]/a[3]/text()',
            # app name
            'app_name': '//div[@class="download-button"]/a/@meta-name',
            # version
            'app_version': '//div[@class="download-button"]/a/@meta-versionname',
            # package name
            'app_package': '//div[@class="download-button"]/a/@meta-packagename',
            # download URL
            'app_dl_url': '//div[@class="download-button"]/a/@meta-url',
        }
        for field, expr in field_xpaths.items():
            loader.add_xpath(field, expr)
        yield loader.load_item()
    links = Selector(response).xpath('//a/@href').extract()
    for url in normalize_and_dup(self.domain, links):
        if same_domain(url, self.domain):
            yield Request(url, callback=self.parse)
def parse(self, response):
    """Parse a Huawei market page.

    App pages (``<domain>/app...``) yield one ``ApkInfoItem``; every
    same-domain link on the page is scheduled for further crawling.
    """
    if response.url.startswith('{}/app'.format(self.domain)):
        loader = ApkInfoLoader(item=ApkInfoItem(), response=response)
        loader.add_value('task_id', settings['SPIDER_TASK'].get('huawei', 0))
        loader.add_value('app_refer_url', response.url)
        # download count
        loader.add_xpath('app_dl_count', '//span[@class="grey sub"]/text()')
        # category is not scraped for this market
        loader.add_value('app_category', 'N/A')
        # app name
        loader.add_xpath('app_name', '//a[@class="title"]/text()')
        # version
        loader.add_xpath(
            'app_version',
            '//ul[@class="app-info-ul nofloat"]/li[4]/span/text()')
        # Package name and download URL are both pulled from the same
        # download-button onclick attribute.
        onclick_xpath = '//a[@class="mkapp-btn mab-download"]/@onclick'
        loader.add_xpath('app_package', onclick_xpath)
        loader.add_xpath('app_dl_url', onclick_xpath)
        yield loader.load_item()
    links = Selector(response).xpath('//a/@href').extract()
    for url in normalize_and_dup(self.domain, links):
        if same_domain(url, self.domain):
            yield Request(url, callback=self.parse)
def parse(self, response):
    """Parse a Wandoujia market page.

    App pages (``<domain>/apps/...``) yield one ``ApkInfoItem``;
    same-domain links are followed except ``/binding`` URLs.
    """
    if response.url.startswith('{}/apps/'.format(self.domain)):
        loader = ApkInfoLoader(item=ApkInfoItem(), response=response)
        loader.add_value('task_id', settings['SPIDER_TASK'].get('wandoujia', 0))
        loader.add_value('app_refer_url', response.url)
        # download count
        loader.add_xpath('app_dl_count', '//div[@class="num-list"]/span/i/@content')
        # app category
        loader.add_xpath(
            'app_category', '//div[@class="infos"]/dl/dd[2]/a[1]/text()')
        # app name
        loader.add_xpath(
            'app_name', '//div[@class="app-info"]/p[@class="app-name"]/span/text()')
        # version
        loader.add_xpath('app_version', '//div[@class="infos"]/dl/dd[5]/text()')
        # Package name is the URL path segment after "apps/"; the
        # startswith guard above guarantees the separator is present.
        loader.add_value('app_package', response.url.split('apps/')[1])
        # download URL
        loader.add_xpath('app_dl_url', '//div[@class="qr-info"]/a/@href')
        yield loader.load_item()
    links = Selector(response).xpath('//a/@href').extract()
    for url in normalize_and_dup(self.domain, links):
        if same_domain(url, self.domain) and '/binding' not in url:
            yield Request(url, callback=self.parse)
def parse(self, response):
    """Parse a Xiaomi market page.

    Detail pages (``<domain>/details?id=...``) yield one ``ApkInfoItem``;
    same-domain links (excluding download anchors) are followed.
    """
    if response.url.startswith('{}/details?id='.format(self.domain)):
        loader = ApkInfoLoader(item=ApkInfoItem(), response=response)
        loader.add_value('task_id', settings['SPIDER_TASK'].get('xiaomi', 0))
        loader.add_value('app_refer_url', response.url)
        # download count is not exposed on the page; record zero
        loader.add_value('app_dl_count', 0)
        # app category
        loader.add_xpath(
            'app_category', '//div[@class="bread-crumb"]/ul/li[2]/a/text()')
        # app name
        loader.add_xpath('app_name', '//div[@class="intro-titles"]/h3/text()')
        # version
        loader.add_xpath(
            'app_version',
            '//div[@class="details preventDefault"]/ul/li[4]/text()')
        # package name
        loader.add_xpath(
            'app_package',
            '//div[@class="details preventDefault"]/ul/li[8]/text()')
        # download URL
        loader.add_xpath(
            'app_dl_url',
            '//div[@class="details preventDefault"]/ul/li[10]/text()')
        yield loader.load_item()
    # Skip download anchors when collecting links to crawl.
    links = Selector(response).xpath('//a[not(@class="download")]/@href').extract()
    for url in normalize_and_dup(self.domain, links):
        if same_domain(url, self.domain):
            yield Request(url, callback=self.parse)
def parse(self, response):
    """Parse a Mumayi market page.

    Detail pages (``<domain>/android-<id>.html``) yield one
    ``ApkInfoItem``; same-domain ``/android`` links are followed.
    """
    detail_pattern = r'{}/android-\d+.html'.format(self.domain)
    if re.search(detail_pattern, response.url):
        loader = ApkInfoLoader(item=ApkInfoItem(), response=response)
        loader.add_value('task_id', settings['SPIDER_TASK'].get('mumayi', 0))
        loader.add_value('app_refer_url', response.url)
        # download count is not exposed on the page; record zero
        loader.add_value('app_dl_count', 0)
        # app category
        loader.add_xpath(
            'app_category', '//div[@class="place10 fl hidden sb_w"]/a[3]/text()')
        # app name and version share the same span on this market
        title_xpath = '//div[@class="place10 fl hidden sb_w"]/span/text()'
        loader.add_xpath('app_name', title_xpath)
        loader.add_xpath('app_version', title_xpath)
        # package name
        loader.add_xpath('app_package', '//ul[@class="author"]/li[2]/text()')
        # download URL
        loader.add_xpath('app_dl_url', '//a[@id="downurl"]/@href')
        yield loader.load_item()
    # Skip download anchors; only follow same-domain /android links.
    links = Selector(response).xpath('//a[not(@class="download")]/@href').extract()
    for url in normalize_and_dup(self.domain, links):
        if same_domain(url, self.domain) and '/android' in url:
            yield Request(url, callback=self.parse)
def parse(self, response):
    """Parse a 360 (qihu360) market page.

    Detail pages (``<domain>/detail/index/soft_id...``) yield one
    ``ApkInfoItem``; every same-domain link is followed, with the
    ``?recrefer`` tracking suffix stripped from detail URLs so that
    duplicate filtering works.

    Fix: the original used ``re.findall(...)[0]`` for the name, package
    and download URL embedded in the page's inline script; that raised
    ``IndexError`` whenever a pattern was absent, aborting both the item
    and link discovery for the page. Extraction is now guarded with
    ``re.search`` and missing fields are simply skipped.
    """
    if response.url.startswith('{}/detail/index/soft_id'.format(self.domain)):
        loader = ApkInfoLoader(item=ApkInfoItem(), response=response)
        loader.add_value('task_id', settings['SPIDER_TASK'].get('qihu360', 0))
        loader.add_value('app_refer_url', response.url)
        # download count
        loader.add_xpath(
            'app_dl_count',
            '//div[@id="app-info-panel"]/div/dl/dd/div/span[3]/text()')
        # category is not scraped for this market
        loader.add_value('app_category', 'N/A')
        # version
        loader.add_xpath(
            'app_version',
            '//div[@class="base-info"]/table/tbody/tr[2]/td[1]/text()')
        # Name, package and download URL live in an inline JS blob;
        # skip any field whose pattern is not present on the page.
        for field, pattern in (
                ('app_name', "'sname': '(.*)'"),
                ('app_package', "'pname': \"(.*)\""),
                ('app_dl_url', "'downloadUrl': '(.*)'"),
        ):
            match = re.search(pattern, response.text)
            if match:
                loader.add_value(field, match.group(1))
        yield loader.load_item()
    links = Selector(response).xpath('//a/@href').extract()
    for url in normalize_and_dup(self.domain, links):
        if same_domain(url, self.domain):
            if (url.startswith('{}/detail/index/soft_id'.format(self.domain))
                    and '?recrefer' in url):
                # Drop the tracking query so equivalent detail URLs dedupe.
                url = url.partition('?recrefer')[0]
            yield Request(url, callback=self.parse)
def parse(self, response):
    """Parse an Anzhi market page.

    ``/pkg`` and ``/soft`` pages yield one ``ApkInfoItem``; every
    same-domain link on the page is scheduled for further crawling.
    """
    selector = Selector(response)
    is_detail_page = (response.url.startswith('http://www.anzhi.com/pkg')
                      or response.url.startswith('http://www.anzhi.com/soft'))
    if is_detail_page:
        loader = ApkInfoLoader(item=ApkInfoItem(), response=response)
        loader.add_value('task_id', settings['SPIDER_TASK'].get('anzhi', 0))
        loader.add_value('app_refer_url', response.url)
        # Field name -> XPath on the Anzhi detail page.
        field_xpaths = {
            # download count
            'app_dl_count': '//ul[@id="detail_line_ul"]/li[2]/span/text()',
            # app category
            'app_category': '//div[@class="title"]/h2/a/text()',
            # app name
            'app_name': '//div[@class="title"]/h3/text()',
            # version
            'app_version': '//div[@class="detail_description"]/div[@class="detail_line"]/span[@class="app_detail_version"]/text()',
            # package name
            'app_package': '//div[@class="detail_icon"]/ul/li[2]/a/@href',
            # download URL (embedded in an onclick handler)
            'app_dl_url': '//div[@class="detail_other"]/div[@class="detail_down"]/a/@onclick',
        }
        for field, expr in field_xpaths.items():
            loader.add_xpath(field, expr)
        yield loader.load_item()
    links = selector.xpath('//a/@href').extract()
    for url in normalize_and_dup(self.domain, links):
        if same_domain(url, self.domain):
            yield Request(url, callback=self.parse)
def parse(self, response):
    """Parse a Tencent MyApp market page.

    Detail pages (URL containing ``detail.htm?apkName=``) yield one
    ``ApkInfoItem``; every same-domain link is followed.
    """
    if 'detail.htm?apkName=' in response.url:
        loader = ApkInfoLoader(item=ApkInfoItem(), response=response)
        loader.add_value('task_id', settings['SPIDER_TASK'].get('myapp', 0))
        loader.add_value('app_refer_url', response.url)
        # download count
        loader.add_xpath('app_dl_count', '//div[@class="det-ins-num"]/text()')
        # app category
        loader.add_xpath('app_category', '//a[@class="det-type-link"]/text()')
        # Name, package and download URL hang off the install button.
        install_btn = '//div[@class="det-ins-btn-box"]/a[@class="det-ins-btn"]'
        loader.add_xpath('app_name', install_btn + '/@appname')
        # version
        loader.add_xpath(
            'app_version',
            '//div[@data-modname="appOthInfo"]/div[@class="det-othinfo-data"]/text()')
        # package name
        loader.add_xpath('app_package', install_btn + '/@apk')
        # download URL
        loader.add_xpath('app_dl_url', install_btn + '/@ex_url')
        yield loader.load_item()
    links = Selector(response).xpath('//a/@href').extract()
    for url in normalize_and_dup(self.domain, links):
        if same_domain(url, self.domain):
            yield Request(url, callback=self.parse)