def getUpdateTime(self, selector, item):
    """Mark the update time as unknown (-1) on *item* and log it.

    This variant does not query *selector*.
    """
    unknown = -1
    item['update_time'] = unknown
    LogUtil.log("update_time(%d)" % item['update_time'])
def getSystem(self, selector, item):
    """Mark the supported system as unknown ("NULL") on *item* and log it.

    This variant does not query *selector*.
    """
    placeholder = "NULL"
    item['system'] = placeholder
    LogUtil.log("system(%s)" % item['system'])
def getTag(self, selector, item):
    """Mark the tag field as unknown ("NULL") on *item* and log it.

    This variant does not query *selector*.
    """
    placeholder = "NULL"
    item['tag'] = placeholder
    LogUtil.log("tag(%s)" % item['tag'])
def getInstallCount(self, selector, item):
    """Parse the download counter text into item['install_count'].

    The first decimal number found in the counter text is multiplied by
    the factor of every Chinese magnitude marker present in that text and
    stored as a long. The field stays at -1 when the node is missing or
    contains no number.

    Args:
        selector: selector for the page being scraped.
        item: mapping that receives the 'install_count' value.
    """
    xpath = '//div[@class="app-intro"]//span[@class="download-num"]/text()'
    item['install_count'] = long(-1)

    eles = selector.xpath(xpath).extract()
    nums = re.findall(r"\d+\.?\d*", eles[0]) if eles else []
    if nums:
        text = eles[0]
        num = float(nums[0])
        # Unicode literals, so the membership test against the extracted
        # unicode text never depends on implicit byte-string decoding.
        # The factors are applied independently (not as an elif chain),
        # so a compound marker such as 千万 scales by both of its parts.
        units = ((u'亿', 1e8), (u'万', 1e4), (u'千', 1e3))
        for marker, factor in units:
            if marker in text:
                num *= factor
        item['install_count'] = long(num)

    LogUtil.log("install_count(%d)" % item['install_count'])
def getLikeCount(self, selector, item):
    """Mark the like count as unknown (-1, as a long) on *item* and log it.

    This variant does not query *selector*.
    """
    unknown = long(-1)
    item['like_count'] = unknown
    LogUtil.log("like_count(%d)" % item['like_count'])
def getCommentCount(self, selector, item):
    """Mark the comment count as unknown (-1, as a long) on *item* and log it.

    This variant does not query *selector*.
    """
    unknown = long(-1)
    item['comment_count'] = unknown
    LogUtil.log("comment_count(%d)" % item['comment_count'])
def getSize(self, selector, item):
    """Parse the package size text into item['size'] as a long.

    The first decimal number in the size text is multiplied by 1024,
    1024**2 and/or 1024**3 when the text contains 'K', 'M' and/or 'G'
    respectively. -1 is stored when the node is missing or holds no number.
    """
    xpath = '//div[@class="app-intro"]//span[@class="size"]/text()'
    eles = selector.xpath(xpath).extract()

    size = -1
    if eles:
        text = eles[0]
        nums = re.findall(r"\d+\.?\d*", text)
        if nums:
            size = float(nums[0])
            # Markers are tested in the same order as before: K, M, G.
            multipliers = (
                ('K', 1024),
                ('M', 1024 * 1024),
                ('G', 1024 * 1024 * 1024),
            )
            for marker, factor in multipliers:
                if marker in text:
                    size *= factor

    item['size'] = long(size)
    LogUtil.log("size(%d)" % item['size'])
def parse(self, response):
    """Yield the page's app item, then a follow-up request per discovered link.

    Links are collected, in order, from getCateLink, getAppLink,
    getPageLink and getRelateLink. Each URL is printed and re-queued with
    this method as the callback. The processed-URL counter is incremented
    and logged only after every request has been yielded.
    """
    selector = scrapy.Selector(response)

    # App information container.
    yield self.getItem(selector, response)

    # Each collector is invoked lazily, right before its own links are
    # iterated, so the call order matches four consecutive loops.
    link_collectors = (
        # Home-page links of each category.
        lambda: self.getCateLink(selector),
        # App detail page links.
        lambda: self.getAppLink(selector),
        # Pagination links.
        lambda: self.getPageLink(selector, str(response.url).encode('utf-8')),
        # Related page links.
        lambda: self.getRelateLink(selector),
    )
    for collect in link_collectors:
        for url in collect():
            print(url)
            yield Request(url, callback=self.parse)

    # Tally of processed URLs.
    self.urls_sum += 1
    LogUtil.log("urls_sum(%d)" % self.urls_sum)
def parse(self, response):
    """Yield the app item scraped from *response*, then count the URL.

    This variant follows no links. The processed-URL counter is
    incremented and logged after the item has been yielded.
    """
    page = scrapy.Selector(response)

    # App information container.
    app_item = self.getItem(page, response)
    yield app_item

    # Tally of processed URLs.
    self.urls_sum += 1
    LogUtil.log("urls_sum(%d)" % self.urls_sum)
def getCategory(self, selector, item):
    """Mark the category as unknown ("NULL") on *item* and log it.

    This variant does not query *selector*.
    """
    item['category'] = "NULL"
    LogUtil.log("category(%s)" % item['category'])
def getCommentCount(self, selector, item):
    """Read the comment counter next to the comment link into item['comment_count'].

    The first matching text node is converted with self.strToNum.
    -1 (as a long) is stored when no node matches.
    """
    matches = selector.xpath(
        '//a[@class="item last comment-open"]/i/text()').extract()

    if matches:
        item['comment_count'] = self.strToNum(matches[0])
    else:
        item['comment_count'] = long(-1)

    LogUtil.log("comment_count(%d)" % item['comment_count'])
def getName(self, selector, item):
    """Store the app name from the itemprop="name" title span in item['name'].

    The first matching text node is used, or "NULL" when nothing matches.
    Either value is passed through StrUtil.delWhiteSpace before storing.
    """
    xpath = '//p[@class="app-name"]/span[@class="title" and @itemprop="name"]/text()'
    found = selector.xpath(xpath).extract()

    raw_name = found[0] if found else "NULL"
    item['name'] = StrUtil.delWhiteSpace(raw_name)
    LogUtil.log("name(%s)" % item['name'])
def getName(self, selector, item):
    """Store the app name from the app-intro heading in item['name'].

    The first matching text node is used, or "NULL" when nothing matches.
    Either value is passed through StrUtil.delWhiteSpace before storing.
    """
    xpath = '//div[@class="app-intro"]//h1[@class="app-name"]/span/text()'
    found = selector.xpath(xpath).extract()

    raw_name = found[0] if found else "NULL"
    item['name'] = StrUtil.delWhiteSpace(raw_name)
    LogUtil.log("name(%s)" % item['name'])
def getVersion(self, selector, item):
    """Store the app version from the infos list in item['version'].

    The value is the text of the element that follows the <dt> whose
    label is the version caption, cleaned with StrUtil.delWhiteSpace.
    "NULL" is stored when no such node exists.
    """
    # The value is located by its <dt> label, not by position in the list.
    xpath = u'//dl[@class="infos-list"]/dt[text() = "版本"]/following::*[1]/text()'
    found = selector.xpath(xpath).extract()

    version = "NULL"
    if found:
        version = StrUtil.delWhiteSpace(found[0])
    item['version'] = version

    LogUtil.log("version(%s)" % item['version'])
def getEditorComment(self, selector, item):
    """Store the editor's comment from the app-detail head in item['editor_comment'].

    The first matching text node is used, or "NULL" when nothing matches.
    Either value is passed through StrUtil.delWhiteSpace before storing.
    """
    xpath = '//div[@class="app-detail"]//span[@class="head-content"]/text()'
    found = selector.xpath(xpath).extract()

    comment = found[0] if found else "NULL"
    item['editor_comment'] = StrUtil.delWhiteSpace(comment)
    LogUtil.log("editor_comment(%s)" % item['editor_comment'])
def getSize(self, selector, item):
    """Store the size from the fileSize <meta> content attribute in item['size'].

    The first matching attribute value is converted with long().
    -1 (as a long) is stored when the tag is absent.
    """
    found = selector.xpath('//meta[@itemprop="fileSize"]/@content').extract()

    if found:
        item['size'] = long(found[0])
    else:
        item['size'] = long(-1)

    LogUtil.log("size(%d)" % item['size'])
def getUpdateTime(self, selector, item):
    """Store the publish date as a timestamp in item['update_time'].

    The text of the datePublished <time> element is parsed as a
    year/month/day date and converted to a long via time.mktime.
    -1 (as a long) is stored when the element is absent.
    """
    found = selector.xpath('//time[@itemprop="datePublished"]/text()').extract()

    stamp = long(-1)
    if found:
        published = datetime.datetime.strptime(found[0], "%Y年%m月%d日")
        stamp = long(time.mktime(published.timetuple()))

    item['update_time'] = stamp
    LogUtil.log("update_time(%d)" % item['update_time'])
def getDescInfo(self, selector, item):
    """Store the long description from the app-detail section in item['desc_info'].

    All text nodes under the brief-long paragraphs are joined with single
    spaces, or "NULL" is used when there are none. Either value is passed
    through StrUtil.delWhiteSpace before storing.
    """
    xpath = '//div[@class="app-detail"]//div[@class="brief-long"]/p//text()'
    fragments = selector.xpath(xpath).extract()

    description = " ".join(fragments) if fragments else "NULL"
    item['desc_info'] = StrUtil.delWhiteSpace(description)
    LogUtil.log("desc_info(%s)" % item['desc_info'])
def getDescInfo(self, selector, item):
    """Store the description from the itemprop="description" block in item['desc_info'].

    All text nodes under the block are joined with single spaces, or
    "NULL" is used when there are none. Either value is passed through
    StrUtil.delWhiteSpace before storing.
    """
    fragments = selector.xpath('//div[@itemprop="description"]//text()').extract()

    description = " ".join(fragments) if fragments else "NULL"
    item['desc_info'] = StrUtil.delWhiteSpace(description)
    LogUtil.log("desc_info(%s)" % item['desc_info'])
def getFeature(self, selector, item):
    """Store the app-feature labels, joined with '-', in item['feature'].

    Each text node under the feature spans is cleaned with
    StrUtil.delWhiteSpace and kept when StrUtil.isEmpty returns a truthy
    value for it. "NULL" is stored when no label survives.

    NOTE(review): despite its name, StrUtil.isEmpty is used here as the
    keep-predicate -- confirm it is truthy for non-empty strings.

    Args:
        selector: selector for the page being scraped.
        item: mapping that receives the 'feature' value.
    """
    xpath = '//div[@class="yui3-g"]//div[@class="app-feature"]//span[@class="app-feature-detail"]//text()'
    item['feature'] = "NULL"

    eles = selector.xpath(xpath).extract()
    cleaned = [StrUtil.delWhiteSpace(ele) for ele in eles]
    labels = [text for text in cleaned if StrUtil.isEmpty(text)]
    # Only overwrite the sentinel when something was found. Previously the
    # join ran unconditionally and replaced "NULL" with an empty string.
    if labels:
        item['feature'] = "-".join(labels)

    LogUtil.log("feature(%s)" % item['feature'])
def getSystem(self, selector, item):
    """Store the supported operating systems text in item['system'].

    Runs of whitespace in the first operatingSystems text node are
    collapsed to single spaces and the result is stripped. "NULL" is
    stored when the node is absent.
    """
    found = selector.xpath('//dd[@itemprop="operatingSystems"]/text()').extract()

    if found:
        whitespace = re.compile('\s+')
        item['system'] = whitespace.sub(' ', found[0]).strip()
    else:
        item['system'] = "NULL"

    LogUtil.log("system(%s)" % item['system'])
def getFeature(self, selector, item):
    """Store the feature labels from the infors-txt title list in item['feature'].

    Each text node under the list is cleaned with StrUtil.delWhiteSpace
    and kept when StrUtil.isEmpty returns a truthy value for it. The
    surviving labels are joined with '-'. "NULL" is stored when no label
    survives.

    NOTE(review): despite its name, StrUtil.isEmpty is used here as the
    keep-predicate -- confirm it is truthy for non-empty strings.

    Args:
        selector: selector for the page being scraped.
        item: mapping that receives the 'feature' value.
    """
    xpath = '//div[@class="infors-txt"]/div[@class="title"]/ul//text()'
    item['feature'] = "NULL"

    eles = selector.xpath(xpath).extract()
    cleaned = [StrUtil.delWhiteSpace(ele) for ele in eles]
    labels = [text for text in cleaned if StrUtil.isEmpty(text)]
    # Only overwrite the sentinel when something was found. Previously the
    # join ran unconditionally and replaced "NULL" with an empty string.
    if labels:
        item['feature'] = "-".join(labels)

    LogUtil.log("feature(%s)" % item['feature'])
def getSource(self, selector, item):
    """Store the app's origin from the infos list in item['source'].

    The string value of the element following the <dt> that carries the
    origin caption is taken, whitespace runs are collapsed to single
    spaces, and the result is stripped. "NULL" is stored when no such
    element exists.
    """
    # The value is located by its <dt> label, not by a fixed class/position.
    xpath = u'//dl[@class="infos-list"]/dt[text() = "来自"]/following::*[1]'
    found = selector.xpath(xpath).xpath('string(.)').extract()

    if found:
        whitespace = re.compile('\s+')
        item['source'] = whitespace.sub(' ', found[0]).strip()
    else:
        item['source'] = "NULL"

    LogUtil.log("source(%s)" % item['source'])
def getEditorComment(self, selector, item):
    """Store the editor's comment from the clearfix block in item['editor_comment'].

    Text nodes are cleaned with StrUtil.delWhiteSpace and filtered with
    StrUtil.isEmpty. The second surviving entry is stored, and "NULL"
    is kept when fewer than two entries survive.
    """
    item['editor_comment'] = "NULL"

    raw = selector.xpath('//dl[@class="clearfix"]/dd/p//text()').extract()
    cleaned = [StrUtil.delWhiteSpace(text) for text in raw]
    kept = [text for text in cleaned if StrUtil.isEmpty(text)]
    # Index 1 is the wanted entry; presumably index 0 is a caption -- TODO confirm.
    if len(kept) > 1:
        item['editor_comment'] = kept[1]

    LogUtil.log("editor_comment(%s)" % item['editor_comment'])
def getTag(self, selector, item):
    """Store the app tags, joined with '-', in item['tag'].

    Text nodes under the app-tags block are cleaned with
    StrUtil.delWhiteSpace and filtered with StrUtil.isEmpty. Every
    surviving entry after the first is joined with '-'. "NULL" is stored
    when fewer than two entries survive.

    Args:
        selector: selector for the page being scraped.
        item: mapping that receives the 'tag' value.
    """
    xpath = '//div[@class="app-tags"]//text()'
    item['tag'] = "NULL"

    eles = selector.xpath(xpath).extract()
    cleaned = [StrUtil.delWhiteSpace(ele) for ele in eles]
    texts = [text for text in cleaned if StrUtil.isEmpty(text)]
    # Guard on the filtered list. Checking the raw node count let a block
    # with nothing left after the first entry store "" instead of "NULL".
    # The first entry is skipped; presumably it is a caption -- TODO confirm.
    if len(texts) > 1:
        item['tag'] = "-".join(texts[1:])

    LogUtil.log("tag(%s)" % item['tag'])
def getSource(self, selector, item):
    """Store the app's source from the first base-info table cell in item['source'].

    Text nodes of the cell are cleaned with StrUtil.delWhiteSpace and
    filtered with StrUtil.isEmpty. The second surviving entry is stored,
    and "NULL" is kept when fewer than two entries survive.

    Args:
        selector: selector for the page being scraped.
        item: mapping that receives the 'source' value.
    """
    xpath = '//div[@class="base-info"]/table/tbody/tr[1]/td[1]//text()'
    eles = selector.xpath(xpath).extract()
    item['source'] = "NULL"

    cleaned = [StrUtil.delWhiteSpace(ele) for ele in eles]
    texts = [text for text in cleaned if StrUtil.isEmpty(text)]
    # Guard on the filtered list. The old check counted raw nodes, so
    # texts[1] raised IndexError whenever filtering left under two entries.
    # Index 1 is the wanted entry; presumably index 0 is a caption -- TODO confirm.
    if len(texts) > 1:
        item['source'] = texts[1]

    LogUtil.log("source(%s)" % item['source'])
def getScore(self, selector, item):
    """Store the vote-panel score, multiplied by 10, in item['score'].

    Text nodes of the vote panel are cleaned with StrUtil.delWhiteSpace,
    filtered with StrUtil.isEmpty and joined with spaces. The first
    decimal number in that text times 10 is stored as a float. -1 is
    kept when no number is found.
    """
    xpath = '//dl[@class="clearfix"]/dd/div[@class="pf"]/span[@class="s-1 js-votepanel"]//text()'
    item['score'] = -1

    raw = selector.xpath(xpath).extract()
    cleaned = [StrUtil.delWhiteSpace(text) for text in raw]
    kept = [text for text in cleaned if StrUtil.isEmpty(text)]
    numbers = re.findall(r"\d+\.?\d*", " ".join(kept))
    if numbers:
        item['score'] = float(numbers[0]) * 10

    LogUtil.log("score(%d)" % item['score'])
def getSource(self, selector, item):
    """Store the text of the origin link in the app-intro block in item['source'].

    The first matching text node is cleaned with StrUtil.delWhiteSpace.
    "NULL" is stored when the link is absent.
    """
    xpath = '//div[@class="app-intro"]//div[@class="origin-wrap"]//a[@class="origin"]/text()'
    item['source'] = "NULL"

    found = selector.xpath(xpath).extract()
    if found:
        item['source'] = StrUtil.delWhiteSpace(found[0])

    LogUtil.log("source(%s)" % item['source'])
def getUpdateTime(self, selector, item):
    """Store the update date from the second base-info table cell as a timestamp.

    Text nodes of the cell are cleaned with StrUtil.delWhiteSpace and
    filtered with StrUtil.isEmpty. The second surviving entry is parsed
    as a YYYY-MM-DD date and stored in item['update_time'] as a long via
    time.mktime. -1 is kept when fewer than two entries survive.

    Args:
        selector: selector for the page being scraped.
        item: mapping that receives the 'update_time' value.

    Raises:
        ValueError: if the date text does not match YYYY-MM-DD.
    """
    xpath = '//div[@class="base-info"]/table/tbody/tr[1]/td[2]//text()'
    eles = selector.xpath(xpath).extract()
    item['update_time'] = long(-1)

    cleaned = [StrUtil.delWhiteSpace(ele) for ele in eles]
    texts = [text for text in cleaned if StrUtil.isEmpty(text)]
    # Guard on the filtered list. The old check counted raw nodes, so
    # texts[1] raised IndexError whenever filtering left under two entries.
    # Index 1 is the wanted entry; presumably index 0 is a caption -- TODO confirm.
    if len(texts) > 1:
        parsed = datetime.datetime.strptime(texts[1], "%Y-%m-%d")
        item['update_time'] = long(time.mktime(parsed.timetuple()))

    LogUtil.log("update_time(%d)" % item['update_time'])
def getCategory(self, selector, item):
    """Store the navigation trail, joined with '-', in item['category'].

    The link texts of the app-nav block are read and the first one is
    skipped. The rest are cleaned with StrUtil.delWhiteSpace and joined
    with '-'. "NULL" is stored when fewer than two links are present.
    """
    crumbs = selector.xpath('//div[@class="app-nav"]//a/text()').extract()

    if len(crumbs) > 1:
        trail = [StrUtil.delWhiteSpace(crumb) for crumb in crumbs[1:]]
        item['category'] = "-".join(trail)
    else:
        item['category'] = "NULL"

    LogUtil.log("category(%s)" % item['category'])