def parse_detail(self, response): soup = bs4.BeautifulSoup(response.text, "html.parser") platform = self.name commonname = soup.find_all( "h1", class_='app-name')[0].find('span').get_text() detailinfo = soup.find_all("div", class_='detail')[0] size = detailinfo.find_all("span", class_='size')[0].get_text() version = detailinfo.find_all("span", class_='version')[0].get_text() sizepattern = re.compile(ur'[0-9\.]+.*') versionpattern = re.compile(ur'[0-9\.]+') idpattern = re.compile(ur'[0-9]+') size = sizepattern.search(size).group() version = versionpattern.search(version).group() packagename = commonname platformid = idpattern.search(response.url).group() urllink = soup.find_all("a", class_='apk')[0]['href'] category = soup.find_all("a", attrs={'target': '_self'})[2].get_text() if platformid in self.apkbf: return self.apkbf.add(platformid) item = ItemLoader(item=ApkspiderItem(), response=response) item.add_value('commonname', commonname) item.add_value('apkplaform', platform) item.add_value('apkid_specifiedbyplaform', platformid) item.add_value('category', category) item.add_value('packagename', packagename) item.add_value('size', size) item.add_value('version', version) item.add_value('urllink', urllink) item.add_value('file_urls', urllink) item.add_value('checkpoint', self.checkpoint) yield item.load_item()
def parse_detail(self, response): soup = bs4.BeautifulSoup(response.text, 'html.parser'); idpattern = re.compile(ur'[0-9]+'); appinfo = soup.select('.app-info')[0]; apknamepattern = re.compile(ur'appdetail/.*?/'); commonname = appinfo.select('.title')[0].get_text(); category = response.meta['category']; platform = self.name; sv = appinfo.select('.dec')[0].get_text().split('|'); size = sv[0]; version = sv[1]; print(response.url); apkid = idpattern.search(response.url).group(); print(apkid); packagename = apknamepattern.search(response.url).group()[10:-1]; urllink = soup.select('.download')[0]['href']; if apkid in self.apkbf: return; self.apkbf.add(apkid); item = ItemLoader(item=ApkspiderItem(), response=response); item.add_value('apkid_specifiedbyplaform',apkid); item.add_value('commonname',commonname); item.add_value('apkplaform',platform); item.add_value('category',category); item.add_value('packagename',packagename); item.add_value('size',size); item.add_value('version',version); item.add_value('urllink',urllink); item.add_value('file_urls',urllink); item.add_value('checkpoint',self.checkpoint); yield item.load_item();
def parse_detail(self, response): packagenamepattern = re.compile(ur'/[^/]*.apk') soup = bs4.BeautifulSoup(response.text, 'html.parser') idpattern = re.compile(ur'[0-9]+') versionpattern = re.compile(ur'[0-9.]+') commonname = soup.select('dt.clearfix')[0].get_text().strip() version = versionpattern.search(commonname).group() category = response.meta['category'] msgsoup = soup.select('.msg-list')[0] msglist = msgsoup.select('li') size = msglist[0].get_text() size = size[size.find(u':') + 1:].strip() developer = msglist[1].get_text() developer = developer[developer.find(u':') + 1:].strip() updatetime = msglist[5].get_text() updatetime = updatetime[updatetime.find(u':') + 1:].strip() apkid = idpattern.search(response.url).group() urllink = self.httpprotocol + soup.select('.dl-btn')[0]['tempurl'] print(urllink) packagename = packagenamepattern.search(urllink).group() item = ItemLoader(item=ApkspiderItem(), response=response) item.add_value('commonname', commonname) item.add_value('apkplaform', self.name) item.add_value('apkid_specifiedbyplaform', apkid) item.add_value('category', category) item.add_value('developer', developer) item.add_value('packagename', packagename) item.add_value('updatetime', updatetime) item.add_value('size', size) item.add_value('version', version) item.add_value('urllink', response.url) item.add_value('file_urls', response.url) yield item.load_item()
def parse_json(self, response): categorypattern = re.compile(ur'categoryId=-?[0-9]+'); pagecontext = re.compile(ur'pageContext=-?[0-9]+'); idpattern = re.compile(ur'-?[0-9]+'); catestring = categorypattern.search(response.url).group(); pagestring = pagecontext.search(response.url).group(); cateid = idpattern.search(catestring).group(); pageid = idpattern.search(pagestring).group(); json_response = json.loads(response.body_as_unicode()); count = 0; if json_response.has_key('count'): count = int(json_response['count']); else: return; print(response.url); print(count); if count <= 0: return; objs = ""; if json_response.has_key('obj'): objs = json_response['obj']; else: return; apkplaform = 'qq'; for obj in objs: if obj['apkUrl'] in self.categorybf: continue; if obj['appId'] in self.apkbf: continue; self.apkbf.add(obj['appId']); self.categorybf.add(obj['apkUrl']); print(obj); item = ItemLoader(item=ApkspiderItem(), response=response); item.add_value("commonname",obj['appName']); item.add_value('apkplaform',apkplaform); item.add_value('apkid_specifiedbyplaform',str(obj['appId'])); item.add_value('category',obj['categoryName']); item.add_value('developer',obj['authorName']); item.add_value('packagename',obj['pkgName']); item.add_value('updatetime',obj['apkPublishTime']); item.add_value('version',obj['versionName']); item.add_value('urllink',obj['apkUrl']); item.add_value('file_urls',obj['apkUrl']); item.add_value('checkpoint',self.checkpoint); yield item.load_item(); url = self.base_cate_url%(int(response.meta['orgname']),int(cateid),int(pageid)+self.step); yield Request( url, headers={"User-Agent":"Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:63.0) Gecko/20100101 Firefox/63.0"}, meta={'orgname':response.meta['orgname']}, callback=self.parse_json );
def parse_detail(self, response): versionpattern = re.compile(ur'[0-9\.]+') soup = bs4.BeautifulSoup(response.text, "html.parser") commonname = soup.select('.app-name')[0].get_text() info = soup.select('.infos-list')[0] size = info.find('dd').get_text() platform = self.name urllink = soup.find_all('a', class_='normal-dl-btn')[0] if not urllink.has_attr('href'): return urllink = urllink['href'] version = info.select('dd')[2].get_text() print(version) if versionpattern.search(version) == None: version = info.select('dd')[3].get_text() developer = info.select('.dev-sites') if len(developer) == 0: developer = "" else: developer = developer[0].get_text() permission = list() permlist = info.find_all('span', class_='perms') for perm in permlist: permission.append(perm.get_text()) category = info.find_all('a')[0].get_text() updatetime = soup.find('span', class_='update-time').get_text() timepattern = re.compile(ur'[0-9/]+') updatetime = timepattern.search(updatetime).group() packagename = response.url[response.url.rfind('/') + 1:] if packagename in self.apkbf: return self.apkbf.add(packagename) item = ItemLoader(item=ApkspiderItem(), response=response) item.add_value('apkid_specifiedbyplaform', packagename) item.add_value('commonname', commonname) item.add_value('apkplaform', platform) item.add_value('category', category) item.add_value('developer', developer) item.add_value('packagename', packagename) item.add_value('updatetime', updatetime) item.add_value('size', size) item.add_value('version', version) item.add_value('permission', permission) item.add_value('urllink', urllink) item.add_value('file_urls', urllink) item.add_value('checkpoint', self.checkpoint) yield item.load_item()
def parse_detail(self, response):
    """Turn one app detail page into an ``ApkspiderItem``.

    Returns without yielding when the page does not have exactly four
    ``li.ul-li-detail`` entries or when the download anchor has no
    ``onclick`` attribute.  The id and download link are the 2nd and 12th
    pieces of the ``onclick`` text split on single quotes.  The link and
    the package name are printed.  An id already in ``self.apkbf`` yields
    nothing; otherwise it is added and the loaded item is yielded.
    """
    soup = bs4.BeautifulSoup(response.text, "html.parser")
    info = soup.select('div.app-info.flt')[0]
    commonname = info.select('.title')[0].get_text()
    category = response.meta['category']
    platform = self.name

    detailinfos = info.select('li.ul-li-detail')
    if len(detailinfos) != 4:
        return
    size = detailinfos[0].select('span')[0].get_text()
    updatetime = detailinfos[1].select('span')[0].get_text()
    developer = detailinfos[2].select('span')[0].get_text()
    version = detailinfos[3].select('span')[0].get_text()

    # Only list entries whose text starts with the bullet are permissions.
    permissionlist = []
    for entry in soup.select('.hidepermission')[0].select('li'):
        text = entry.get_text()
        if text.startswith(u'·'):
            permissionlist.append(text)

    download_anchor = soup.select('a.mkapp-btn.mab-download')[0]
    if not download_anchor.has_attr("onclick"):
        return
    onclick_parts = download_anchor['onclick'].split('\'')
    apkid = onclick_parts[1]
    urllink = onclick_parts[11]
    # Cut the link at "?sign".  partition() leaves the link intact when the
    # marker is absent, whereas slicing with find() == -1 would silently
    # drop the last character.
    urllink = urllink.partition('?sign')[0]
    print(urllink)
    packagename = urllink[urllink.rfind('/') + 1:]
    print(packagename)

    if apkid in self.apkbf:
        return
    self.apkbf.add(apkid)

    item = ItemLoader(item=ApkspiderItem(), response=response)
    item.add_value('commonname', commonname)
    item.add_value('apkplaform', platform)
    item.add_value('apkid_specifiedbyplaform', apkid)
    item.add_value('category', category)
    item.add_value('developer', developer)
    item.add_value('packagename', packagename)
    item.add_value('updatetime', updatetime)
    item.add_value('size', size)
    item.add_value('version', version)
    item.add_value('permission', permissionlist)
    item.add_value('urllink', urllink)
    item.add_value('file_urls', urllink)
    item.add_value('checkpoint', self.checkpoint)
    yield item.load_item()
def parse_detail(self, response): urlpattern = re.compile(ur'url=.*') apkidpattern = re.compile(ur'soft_id/[0-9]+') numpattern = re.compile(ur'[0-9]+') packagenamepattern = re.compile(ur'/[^/]*\.apk') soup = bs4.BeautifulSoup(response.text, 'html.parser') print(response.url) commonname = soup.select('#app-name')[0].get_text() size = soup.select('.s-3')[1].get_text() urllink = urlpattern.search( soup.select('.js-downLog.dbtn')[0]['href']).group()[4:] packagename = packagenamepattern.search(urllink).group()[1:-4] apkid = numpattern.search(apkidpattern.search( response.url).group()).group() metainfo = soup.select('.base-info')[0] metainfo = metainfo.select('td') developer = metainfo[0].get_text() developer = developer[developer.find(u':') + 1:] version = metainfo[2].get_text() version = version[version.find(u':') + 1:] updatetime = metainfo[1].get_text() updatetime = updatetime[updatetime.find(u':') + 1:] permissionlist = list() permission = soup.select('#authority-panel')[0].select( 'p')[0].get_text().split('\n') category = response.meta['category'] for perm in permission: if perm.strip().startswith(u'-'): permissionlist.append(perm.strip()) if apkid in self.apkbf: return self.apkbf.add(apkid) item = ItemLoader(item=ApkspiderItem(), response=response) item.add_value('commonname', commonname) item.add_value('apkplaform', self.name) item.add_value('apkid_specifiedbyplaform', apkid) item.add_value('category', category) item.add_value('developer', developer) item.add_value('packagename', packagename) item.add_value('updatetime', updatetime) item.add_value('size', size) item.add_value('version', version) item.add_value('permission', permissionlist) item.add_value('urllink', urllink) item.add_value('file_urls', urllink) item.add_value('checkpoint', self.checkpoint) yield item.load_item()
def parse_detail(self, response): soup = bs4.BeautifulSoup(response.text, 'html.parser') print(response.url) urllinkpattern = re.compile(ur'\'.*\'') urllink = soup.select('.download_app')[0] if not urllink.has_attr( 'onclick') or urllink['onclick'] == 'return false;': return urllink = urllink['onclick'] urllink = urllinkpattern.search(urllink).group()[1:-1] commonname = soup.select('.app-name')[0].get_text() detaillist = soup.select('.art-content') size = detaillist[2].get_text() size = size[size.find(u':') + 1:] version = detaillist[3].get_text() version = version[version.find(u':') + 1:] category = detaillist[6].get_text() category = category[category.find(u':') + 1:] packagename = response.url[response.url.rfind('/') + 1:] permissionlist = list() permissions = soup.select('.permissions-list')[0].find_all('li') for perm in permissions: permissionlist.append(perm.get_text()) if packagename in self.apkbf: return self.apkbf.add(packagename) item = ItemLoader(item=ApkspiderItem(), response=response) item.add_value('apkid_specifiedbyplaform', packagename) item.add_value('commonname', commonname) item.add_value('apkplaform', self.name) item.add_value('category', category) item.add_value('packagename', packagename) item.add_value('size', size) item.add_value('version', version) item.add_value('permission', permissionlist) item.add_value('urllink', urllink) item.add_value('file_urls', urllink) item.add_value('checkpoint', self.checkpoint) yield item.load_item()
def parse_detail(self, response): print(response.url); numpattern = re.compile(ur'[0-9]+'); soup = bs4.BeautifulSoup(response.text, 'html.parser'); appdetail = soup.select('.app_detail')[0]; commonname = appdetail.select('.detail_line')[0].select('h3')[0].get_text(); version = appdetail.select('.app_detail_version')[0].get_text(); appdetail = appdetail.select('#detail_line_ul')[0].select('li'); category = appdetail[0].get_text(); category = category[category.find(u':')+1:]; updatetime = appdetail[2].get_text(); updatetime = updatetime[updatetime.find(u':')+1:]; size = appdetail[3].get_text(); size = size[size.find(u':')+1:]; developer = appdetail[6].get_text(); developer = developer[developer.find(u':')+1:]; apkid = numpattern.search(soup.select('.detail_down')[0].select('a')[0]['onclick']).group(); dlg = self.downloadgate%int(apkid); proxy = Proxy(apkid, dlg); urllink = proxy.get_downloadaddress(); packagenamepattern = re.compile(ur'/[^/]*\.html'); packagename = packagenamepattern.search(response.url).group()[1:-5]; if apkid in self.apkbf: return; self.apkbf.add(apkid); item = ItemLoader(item=ApkspiderItem(), response=response); item.add_value('commonname',commonname); item.add_value('apkplaform',self.name); item.add_value('apkid_specifiedbyplaform',apkid); item.add_value('category',category); item.add_value('developer',developer); item.add_value('packagename',packagename); item.add_value('updatetime',updatetime); item.add_value('size',size); item.add_value('version',version); item.add_value('urllink',urllink); item.add_value('file_urls',urllink); item.add_value('checkpoint',self.checkpoint); yield item.load_item();
def parse_detail(self, response): soup = bs4.BeautifulSoup(response.text, 'html.parser') infosoup = soup.select('.info_box')[0] versionpattern = re.compile(ur'[0-9\.]+') packagenamepattern = re.compile(ur'/[^/]*.apk') commonname = infosoup.select('h1')[0].get_text() version = versionpattern.search(commonname).group() metainfolist = infosoup.select('em') category = metainfolist[0].get_text() updatetime = metainfolist[1].get_text() size = metainfolist[3].get_text() developer = metainfolist[4].get_text() urllink = soup.select('.btn_android')[0]['href'] for i in range(0, self.TRY_NUM): if not urllink.find('.apk') == -1: break proxy = Proxy(0, urllink) urllink = proxy.get_downloadaddress() idpattern = re.compile(ur'[0-9]+') apkid = idpattern.search(response.url).group() packagename = packagenamepattern.search(urllink).group()[1:-4] if apkid in self.apkbf: return self.apkbf.add(apkid) item = ItemLoader(item=ApkspiderItem(), response=response) item.add_value('commonname', commonname) item.add_value('apkplaform', self.name) item.add_value('apkid_specifiedbyplaform', apkid) item.add_value('category', category) item.add_value('developer', developer) item.add_value('packagename', packagename) item.add_value('updatetime', updatetime) item.add_value('size', size) item.add_value('version', version) item.add_value('urllink', urllink) item.add_value('file_urls', urllink) item.add_value('checkpoint', self.checkpoint) yield item.load_item()
def parse_detail(self, response): soup = bs4.BeautifulSoup(response.text, 'html.parser') appinfo = soup.select('.app-info')[0] commonname = appinfo.select('.app-title')[0].get_text() pls = soup.select('.permission-list') permissionlist = list() if not len(pls) == 0: for perm in pls[0].select('.clearfix')[0].find_all('li'): permissionlist.append(perm.get_text()) category = response.meta['category'] detail_info = soup.select('.app-detail-info')[0].select('strong') size = detail_info[1].get_text() updatetime = detail_info[0].get_text() version = detail_info[2].get_text() urllink = soup.select('.btn-install')[0]['appdownurl'] platform = self.name detailpattern = re.compile(ur'detail_[0-9]+') idpattern = re.compile(ur'[0-9]+') detailstring = detailpattern.search(response.url).group() apkid = idpattern.search(detailstring).group() packagename = commonname if apkid in self.apkbf: return print("apkid%s" % apkid) item = ItemLoader(item=ApkspiderItem(), response=response) item.add_value('commonname', commonname) item.add_value('apkid_specifiedbyplaform', apkid) item.add_value('apkplaform', platform) item.add_value('category', category) item.add_value('packagename', packagename) item.add_value('updatetime', updatetime) item.add_value('size', size) item.add_value('version', version) item.add_value('permission', permissionlist) item.add_value('urllink', urllink) item.add_value('file_urls', urllink) item.add_value('checkpoint', self.checkpoint) yield item.load_item()
def parse_download(self, response):
    """Combine a JSON download answer with metadata carried in ``meta``.

    Yields nothing unless the payload's ``code`` equals 200.  The download
    link is ``payload['value']['downloadUrl']``; every other field is
    copied from ``response.meta``.  ``meta['packagename']`` doubles as the
    id checked against ``self.apkbf``: an already-recorded id yields
    nothing, otherwise it is added and the loaded item is yielded.
    """
    payload = json.loads(response.body_as_unicode())
    if payload['code'] != 200:
        return
    download_link = payload['value']['downloadUrl']

    meta = response.meta
    app_id = meta['packagename']
    if app_id in self.apkbf:
        return
    self.apkbf.add(app_id)

    loader = ItemLoader(item=ApkspiderItem(), response=response)
    loader.add_value('apkid_specifiedbyplaform', app_id)
    # (item field, key in response.meta); values are looked up one at a time.
    copied_fields = (
        ('commonname', 'commonname'),
        ('apkplaform', 'platform'),
        ('category', 'category'),
        ('developer', 'developer'),
        ('packagename', 'packagename'),
        ('updatetime', 'updatetime'),
        ('size', 'size'),
        ('version', 'version'),
    )
    for field, meta_key in copied_fields:
        loader.add_value(field, meta[meta_key])
    loader.add_value('urllink', download_link)
    loader.add_value('file_urls', download_link)
    loader.add_value('checkpoint', self.checkpoint)
    yield loader.load_item()
def parse_download(self, response):
    """Combine a JSON download answer with metadata carried in ``meta``.

    Yields nothing unless the payload's ``errno`` equals 0.  The download
    link is ``payload['data']['file_url']`` passed through
    ``Proxy(0, ...).get_downloadaddress()``; every other field is copied
    from ``response.meta``.  ``meta['appid']`` is the id checked against
    ``self.apkbf``: an already-recorded id yields nothing, otherwise it is
    added and the loaded item is yielded.
    """
    json_response = json.loads(response.body_as_unicode())
    if json_response['errno'] != 0:
        return

    # Test for a duplicate before calling Proxy: for an already-recorded id
    # the looked-up address would be thrown away anyway.
    appid = response.meta['appid']
    if appid in self.apkbf:
        return

    downloadurl = json_response['data']['file_url']
    downloadurl = Proxy(0, downloadurl).get_downloadaddress()

    # Record the id only once the lookup has returned, as before.
    self.apkbf.add(appid)

    item = ItemLoader(item=ApkspiderItem(), response=response)
    item.add_value('commonname', response.meta['commonname'])
    item.add_value('apkplaform', self.name)
    item.add_value('apkid_specifiedbyplaform', appid)
    item.add_value('category', response.meta['category'])
    item.add_value('developer', response.meta['developer'])
    item.add_value('packagename', response.meta['packagename'])
    item.add_value('updatetime', response.meta['updatetime'])
    item.add_value('size', response.meta['size'])
    item.add_value('version', response.meta['version'])
    item.add_value('urllink', downloadurl)
    item.add_value('file_urls', downloadurl)
    item.add_value('checkpoint', self.checkpoint)
    yield item.load_item()
def parse_detail(self, response):
    """Turn one app detail page into an ``ApkspiderItem``.

    Returns without yielding unless the page has exactly one download
    anchor, one ``.details.preventDefault`` block and one ``.intro-titles``
    block, and that details block has exactly one ``ul.cf`` and one
    ``ul.second-ul``.  All extracted texts are UTF-8 encoded.  The
    ``ul.cf`` entries are read as label/value pairs; the platform name is
    the fixed string "xiaomi".  An id already in ``self.apkbf`` yields
    nothing.
    """
    def utf8_text(node):
        return node.text.encode(encoding='UTF-8', errors='strict')

    page = bs4.BeautifulSoup(response.text, "html.parser")
    download_anchors = page.find_all("a", class_='download')
    hidden_blocks = page.select("div .details.preventDefault")
    title_blocks = page.select("div .intro-titles")
    if (len(title_blocks) != 1 or len(hidden_blocks) != 1
            or len(download_anchors) != 1):
        return

    title_block = title_blocks[0]
    app_name = utf8_text(title_block.select("h3")[0])
    source_name = "xiaomi"

    category = utf8_text(title_block.select("p.special-font.action")[0])
    # Keep the bytes from 3 past the first ":" up to the first "|".
    # NOTE(review): the +3 presumably skips multi-byte characters after the
    # colon in the encoded text -- confirm against a real page.
    category = category[category.find(":") + 3:category.find("|")]
    developer = utf8_text(title_block.select("p")[0])
    download_link = urlparse.urljoin(self.base_url,
                                     download_anchors[0]['href'])

    # Label text -> collected value; each value stays empty unless its
    # label is found.  Labels, in order: app id, update time, package name,
    # version number, software size.
    details = {
        'appId:': "",
        '更新时间:': '',
        '包名:': "",
        '版本号:': "",
        '软件大小:': "",
    }

    info_lists = hidden_blocks[0].select('ul.cf')
    if len(info_lists) != 1:
        return
    entries = info_lists[0].select('li')
    # Walk the entries: a recognised label consumes the following entry as
    # its value, an unrecognised one is skipped on its own.  The last entry
    # is never read as a label.
    cursor = 0
    while len(entries) - cursor > 1:
        label = utf8_text(entries[cursor]).strip()
        cursor += 1
        if label in details:
            details[label] = utf8_text(entries[cursor])
            cursor += 1

    permission_lists = hidden_blocks[0].select('ul.second-ul')
    if len(permission_lists) != 1:
        return
    # Each permission text loses its first three bytes before stripping.
    # NOTE(review): presumably a single 3-byte bullet character -- confirm.
    permissions = [
        utf8_text(node)[3:].strip()
        for node in permission_lists[0].select('li')
    ]

    app_id = details['appId:']
    # NOTE(review): the id is tested against self.apkbf but never added in
    # this callback, and it is "" when the page has no "appId:" entry.
    if app_id in self.apkbf:
        return

    loader = ItemLoader(item=ApkspiderItem(), response=response)
    field_values = (
        ('commonname', app_name),
        ('apkplaform', source_name),
        ('apkid_specifiedbyplaform', app_id),
        ('category', category),
        ('developer', developer),
        ('packagename', details['包名:']),
        ('updatetime', details['更新时间:']),
        ('size', details['软件大小:']),
        ('version', details['版本号:']),
        ('permission', permissions),
        ('urllink', download_link),
        ('file_urls', download_link),
        ('checkpoint', self.checkpoint),
    )
    for field, value in field_values:
        loader.add_value(field, value)
    yield loader.load_item()