Example #1
0
    def getUpdateTime(self, selector, item):
        """Store the placeholder update time (-1) on ``item`` and log it.

        ``selector`` is accepted but never read: this variant extracts
        nothing from the page.
        """
        item['update_time'] = -1

        message = "update_time(%d)" % item['update_time']
        LogUtil.log(message)
Example #2
0
    def getSystem(self, selector, item):
        """Store the placeholder system value ("NULL") on ``item`` and log it.

        ``selector`` is accepted but never read: this variant extracts
        nothing from the page.
        """
        item['system'] = "NULL"

        message = "system(%s)" % item['system']
        LogUtil.log(message)
Example #3
0
    def getTag(self, selector, item):
        """Store the placeholder tag value ("NULL") on ``item`` and log it.

        ``selector`` is accepted but never read: this variant extracts
        nothing from the page.
        """
        # Indented with spaces only: the previous spaces-plus-tab indent on
        # this statement did not line up with its siblings for every tab
        # width, which Python 3 (and ``python -tt``) rejects with TabError.
        item['tag'] = "NULL"

        LogUtil.log("tag(%s)" % item['tag'])
Example #4
0
    def getInstallCount(self, selector, item):
        """Extract the download count from the app intro and store it on ``item``.

        The first text node matched by the XPath is scanned for its first
        decimal number.  That number is multiplied by the factor of every
        magnitude marker found in the same text (1e8, 1e4, 1e3) and stored
        as a ``long``.  ``item['install_count']`` stays at -1 when no node
        or no number is found.  The final value is logged.
        """
        xpath = '//div[@class="app-intro"]//span[@class="download-num"]/text()'
        # Unicode literals on purpose: the selector text is presumably unicode
        # (Scrapy's extract()), and searching a non-ASCII *byte* string inside
        # unicode text forces an implicit ASCII decode in Python 2.
        magnitudes = ((u'亿', 1e8), (u'万', 1e4), (u'千', 1e3))

        item['install_count'] = long(-1)

        eles = selector.xpath(xpath).extract()
        if eles:
            text = eles[0]
            nums = re.findall(r"\d+\.?\d*", text)
            if nums:
                num = float(nums[0])
                # Markers are not mutually exclusive; each one present scales
                # the number, in the same order as before.
                for marker, factor in magnitudes:
                    if marker in text:
                        num *= factor
                item['install_count'] = long(num)

        LogUtil.log("install_count(%d)" % item['install_count'])
Example #5
0
    def getLikeCount(self, selector, item):
        """Store the placeholder like count (-1) on ``item`` and log it.

        ``selector`` is accepted but never read: this variant extracts
        nothing from the page.
        """
        item['like_count'] = long(-1)

        message = "like_count(%d)" % item['like_count']
        LogUtil.log(message)
Example #6
0
    def getCommentCount(self, selector, item):
        """Store the placeholder comment count (-1) on ``item`` and log it.

        ``selector`` is accepted but never read: this variant extracts
        nothing from the page.
        """
        item['comment_count'] = long(-1)

        message = "comment_count(%d)" % item['comment_count']
        LogUtil.log(message)
Example #7
0
    def getSize(self, selector, item):
        """Extract the size shown in the app intro and store it on ``item``.

        The first decimal number in the first matched text node is multiplied
        by the factor of every unit letter ('K', 'M', 'G') present in that
        text.  The result is stored as a ``long`` under ``item['size']``;
        -1 is stored when no node or no number is found.  The value is logged.
        """
        xpath = '//div[@class="app-intro"]//span[@class="size"]/text()'
        unit_factors = (
            ('K', 1024),
            ('M', 1024 * 1024),
            ('G', 1024 * 1024 * 1024),
        )

        matches = selector.xpath(xpath).extract()

        size = -1
        if matches:
            text = matches[0]
            numbers = re.findall(r"\d+\.?\d*", text)
            if numbers:
                size = float(numbers[0])
                # Unit letters are checked independently, in K, M, G order.
                for unit, factor in unit_factors:
                    if unit in text:
                        size *= factor

        item['size'] = long(size)
        LogUtil.log("size(%d)" % item['size'])
Example #8
0
    def parse(self, response):
        """Yield the page's item, then follow-up requests for discovered links.

        After the item from ``self.getItem``, one ``Request`` (with this
        method as callback) is yielded per URL returned by, in order:
        ``getCateLink``, ``getAppLink``, ``getPageLink`` and
        ``getRelateLink``.  Each URL is printed before its request is
        yielded.  Finally ``self.urls_sum`` is incremented and logged.
        """
        selector = scrapy.Selector(response)

        # App info container
        yield self.getItem(selector, response)

        # Link sources are wrapped in callables so each one is evaluated only
        # when its turn comes, as with the four separate loops.
        link_sources = (
            # Category home-page links
            lambda: self.getCateLink(selector),
            # App detail-page links
            lambda: self.getAppLink(selector),
            # Pagination links
            lambda: self.getPageLink(selector,
                                     str(response.url).encode('utf-8')),
            # Related-page links
            lambda: self.getRelateLink(selector),
        )
        for fetch_links in link_sources:
            for url in fetch_links():
                print(url)
                yield Request(url, callback=self.parse)

        # Running count of processed URLs
        self.urls_sum += 1
        LogUtil.log("urls_sum(%d)" % self.urls_sum)
    def parse(self, response):
        """Yield the item built from ``response`` and count the processed URL.

        No follow-up requests are produced by this variant.  After the item
        is yielded, ``self.urls_sum`` is incremented and logged.
        """
        page = scrapy.Selector(response)

        # App info container
        app_item = self.getItem(page, response)
        yield app_item

        # Running count of processed URLs
        self.urls_sum += 1
        LogUtil.log("urls_sum(%d)" % self.urls_sum)
Example #10
0
    def getCategory(self, selector, item):
        """Store the placeholder category ("NULL") on ``item`` and log it.

        ``selector`` is accepted but never read: this variant extracts
        nothing from the page.
        """
        item['category'] = "NULL"

        message = "category(%s)" % item['category']
        LogUtil.log(message)
Example #11
0
    def getCommentCount(self, selector, item):
        """Extract the comment count and store it on ``item``.

        The first text node matched by the XPath is converted with
        ``self.strToNum``; -1 is stored when nothing matches.  The stored
        value is logged.
        """
        xpath = '//a[@class="item last comment-open"]/i/text()'

        matches = selector.xpath(xpath).extract()
        if matches:
            item['comment_count'] = self.strToNum(matches[0])
        else:
            item['comment_count'] = long(-1)

        LogUtil.log("comment_count(%d)" % item['comment_count'])
Example #12
0
    def getName(self, selector, item):
        """Extract the app name and store it on ``item``.

        The first matched text node, or "NULL" when nothing matches, is
        passed through ``StrUtil.delWhiteSpace`` and stored under
        ``item['name']``.  The stored value is logged.
        """
        xpath = '//p[@class="app-name"]/span[@class="title" and @itemprop="name"]/text()'

        matches = selector.xpath(xpath).extract()
        raw_name = matches[0] if matches else "NULL"

        item['name'] = StrUtil.delWhiteSpace(raw_name)
        LogUtil.log("name(%s)" % item['name'])
Example #13
0
    def getName(self, selector, item):
        """Extract the app name from the app intro and store it on ``item``.

        The first matched text node, or "NULL" when nothing matches, is
        passed through ``StrUtil.delWhiteSpace`` and stored under
        ``item['name']``.  The stored value is logged.
        """
        xpath = '//div[@class="app-intro"]//h1[@class="app-name"]/span/text()'

        matches = selector.xpath(xpath).extract()
        raw_name = matches[0] if matches else "NULL"

        item['name'] = StrUtil.delWhiteSpace(raw_name)
        LogUtil.log("name(%s)" % item['name'])
Example #14
0
    def getVersion(self, selector, item):
        """Extract the version string and store it on ``item``.

        The XPath selects the text of the first node following the ``dt``
        whose text is "版本" inside the ``infos-list`` definition list.  The
        first match is passed through ``StrUtil.delWhiteSpace``; "NULL" is
        stored as-is when nothing matches.  The stored value is logged.
        """
        # Positional alternative kept for reference:
        # xpath = '//dl[@class="infos-list"]/dd[5]/text()'
        xpath = u'//dl[@class="infos-list"]/dt[text() = "版本"]/following::*[1]/text()'
        matches = selector.xpath(xpath).extract()

        if not matches:
            item['version'] = "NULL"
        else:
            item['version'] = StrUtil.delWhiteSpace(matches[0])

        LogUtil.log("version(%s)" % item['version'])
Example #15
0
    def getEditorComment(self, selector, item):
        """Extract the editor's comment and store it on ``item``.

        The first matched text node, or "NULL" when nothing matches, is
        passed through ``StrUtil.delWhiteSpace`` and stored under
        ``item['editor_comment']``.  The stored value is logged.
        """
        xpath = '//div[@class="app-detail"]//span[@class="head-content"]/text()'

        matches = selector.xpath(xpath).extract()
        raw_comment = matches[0] if matches else "NULL"
        item['editor_comment'] = StrUtil.delWhiteSpace(raw_comment)

        LogUtil.log("editor_comment(%s)" % item['editor_comment'])
Example #16
0
    def getSize(self, selector, item):
        """Read the ``fileSize`` meta content and store it on ``item``.

        The first matched ``content`` attribute is converted with ``long``;
        -1 is stored when nothing matches.  The stored value is logged.
        """
        xpath = '//meta[@itemprop="fileSize"]/@content'

        matches = selector.xpath(xpath).extract()
        if matches:
            item['size'] = long(matches[0])
        else:
            item['size'] = long(-1)

        LogUtil.log("size(%d)" % item['size'])
Example #17
0
    def getUpdateTime(self, selector, item):
        """Extract the publication date and store it on ``item`` as a timestamp.

        The first text node of the ``datePublished`` ``time`` element is
        parsed with the format "%Y年%m月%d日" and converted to a local-time
        timestamp via ``time.mktime``.  ``item['update_time']`` is -1 when
        nothing matches or when the text does not parse.  The stored value
        is logged.
        """
        xpath = '//time[@itemprop="datePublished"]/text()'

        eles = selector.xpath(xpath).extract()

        update_time = long(-1)
        if eles:
            # NOTE(review): the format is a byte string holding non-ASCII
            # characters while the selector text is presumably unicode;
            # confirm this pairing parses under the project's encoding setup.
            try:
                d = datetime.datetime.strptime(eles[0], "%Y年%m月%d日")
            except ValueError as err:
                # Unexpected date text must not abort the whole callback;
                # keep the -1 sentinel and record why.
                LogUtil.log("update_time parse failed(%r)" % (err,))
            else:
                update_time = long(time.mktime(d.timetuple()))
        item['update_time'] = update_time

        LogUtil.log("update_time(%d)" % item['update_time'])
Example #18
0
    def getDescInfo(self, selector, item):
        """Extract the long description text and store it on ``item``.

        All matched text nodes are joined with single spaces ("NULL" is used
        when nothing matches), passed through ``StrUtil.delWhiteSpace`` and
        stored under ``item['desc_info']``.  The stored value is logged.
        """
        xpath = '//div[@class="app-detail"]//div[@class="brief-long"]/p//text()'

        fragments = selector.xpath(xpath).extract()
        raw_desc = " ".join(fragments) if fragments else "NULL"
        item['desc_info'] = StrUtil.delWhiteSpace(raw_desc)

        LogUtil.log("desc_info(%s)" % item['desc_info'])
Example #19
0
    def getDescInfo(self, selector, item):
        """Extract the description text and store it on ``item``.

        All text nodes under the ``description`` element are joined with
        single spaces ("NULL" is used when nothing matches), passed through
        ``StrUtil.delWhiteSpace`` and stored under ``item['desc_info']``.
        The stored value is logged.
        """
        xpath = '//div[@itemprop="description"]//text()'

        fragments = selector.xpath(xpath).extract()
        raw_desc = " ".join(fragments) if fragments else "NULL"
        item['desc_info'] = StrUtil.delWhiteSpace(raw_desc)

        LogUtil.log("desc_info(%s)" % item['desc_info'])
Example #20
0
    def getFeature(self, selector, item):
        """Extract the app feature labels and store them on ``item``.

        Every matched text node is passed through ``StrUtil.delWhiteSpace``;
        the entries accepted by ``StrUtil.isEmpty`` are joined with "-".
        ``item['feature']`` stays "NULL" when the joined text is empty.
        The stored value is logged.
        """
        xpath = '//div[@class="yui3-g"]//div[@class="app-feature"]//span[@class="app-feature-detail"]//text()'

        item['feature'] = "NULL"

        eles = selector.xpath(xpath).extract()
        cleaned = [StrUtil.delWhiteSpace(ele) for ele in eles]
        # NOTE(review): entries are KEPT when StrUtil.isEmpty is true;
        # presumably the helper reports "not empty" despite its name - confirm.
        kept = [text for text in cleaned if StrUtil.isEmpty(text)]
        feature = "-".join(kept)
        # Only replace the sentinel when something was found; previously an
        # empty join silently overwrote "NULL" with "".
        if feature:
            item['feature'] = feature

        LogUtil.log("feature(%s)" % item['feature'])
Example #21
0
    def getSystem(self, selector, item):
        """Extract the supported operating systems and store them on ``item``.

        In the first matched text node every whitespace run is collapsed to
        a single space and the ends are stripped.  "NULL" is stored when
        nothing matches.  The stored value is logged.
        """
        xpath = '//dd[@itemprop="operatingSystems"]/text()'

        eles = selector.xpath(xpath).extract()

        system = "NULL"
        if eles:
            # Raw string: '\s' in a plain literal is an invalid escape
            # sequence that newer Python versions warn about.
            system = re.sub(r'\s+', ' ', eles[0]).strip()
        item['system'] = system

        LogUtil.log("system(%s)" % item['system'])
Example #22
0
    def getFeature(self, selector, item):
        """Extract the app feature labels and store them on ``item``.

        Every matched text node is passed through ``StrUtil.delWhiteSpace``;
        the entries accepted by ``StrUtil.isEmpty`` are joined with "-".
        ``item['feature']`` stays "NULL" when the joined text is empty.
        The stored value is logged.
        """
        xpath = '//div[@class="infors-txt"]/div[@class="title"]/ul//text()'

        item['feature'] = "NULL"

        eles = selector.xpath(xpath).extract()
        cleaned = [StrUtil.delWhiteSpace(ele) for ele in eles]
        # NOTE(review): entries are KEPT when StrUtil.isEmpty is true;
        # presumably the helper reports "not empty" despite its name - confirm.
        kept = [text for text in cleaned if StrUtil.isEmpty(text)]
        feature = "-".join(kept)
        # Only replace the sentinel when something was found; previously an
        # empty join silently overwrote "NULL" with "".
        if feature:
            item['feature'] = feature

        LogUtil.log("feature(%s)" % item['feature'])
Example #23
0
    def getSource(self, selector, item):
        """Extract the app's source and store it on ``item``.

        The XPath selects the first node following the ``dt`` whose text is
        "来自" inside the ``infos-list`` definition list; its string value is
        taken, whitespace runs are collapsed to single spaces and the ends
        are stripped.  "NULL" is stored when nothing matches.  The stored
        value is logged.
        """
        # Attribute-based alternative kept for reference:
        # xpath = '//a[@itemprop="url" and @class="dev-sites"]/span/text()'
        xpath = u'//dl[@class="infos-list"]/dt[text() = "来自"]/following::*[1]'

        eles = selector.xpath(xpath).xpath('string(.)').extract()

        source = "NULL"
        if eles:
            # Raw string: '\s' in a plain literal is an invalid escape
            # sequence that newer Python versions warn about.
            source = re.sub(r'\s+', ' ', eles[0]).strip()
        item['source'] = source

        LogUtil.log("source(%s)" % item['source'])
Example #24
0
    def getEditorComment(self, selector, item):
        """Extract the editor's comment and store it on ``item``.

        Matched text nodes are passed through ``StrUtil.delWhiteSpace`` and
        filtered with ``StrUtil.isEmpty``; the second surviving entry is
        stored.  ``item['editor_comment']`` stays "NULL" when fewer than two
        entries survive.  The stored value is logged.
        """
        xpath = '//dl[@class="clearfix"]/dd/p//text()'

        item['editor_comment'] = "NULL"

        fragments = selector.xpath(xpath).extract()
        cleaned = [StrUtil.delWhiteSpace(fragment) for fragment in fragments]
        kept = [text for text in cleaned if StrUtil.isEmpty(text)]
        if len(kept) > 1:
            item['editor_comment'] = kept[1]

        LogUtil.log("editor_comment(%s)" % item['editor_comment'])
Example #25
0
    def getTag(self, selector, item):
        """Extract the app tags and store them on ``item``.

        Matched text nodes are passed through ``StrUtil.delWhiteSpace`` and
        filtered with ``StrUtil.isEmpty``; every surviving entry after the
        first is joined with "-".  ``item['tag']`` stays "NULL" when fewer
        than two entries survive.  The stored value is logged.
        """
        xpath = '//div[@class="app-tags"]//text()'

        item['tag'] = "NULL"

        eles = selector.xpath(xpath).extract()
        cleaned = [StrUtil.delWhiteSpace(ele) for ele in eles]
        kept = [text for text in cleaned if StrUtil.isEmpty(text)]
        # Guard on the filtered list, the one actually sliced: checking the
        # raw list let an empty join overwrite "NULL" with "".
        if len(kept) > 1:
            item['tag'] = "-".join(kept[1:])

        LogUtil.log("tag(%s)" % item['tag'])
Example #26
0
    def getSource(self, selector, item):
        """Extract the app's source from the base-info table and store it.

        Text nodes of the first cell in the first row are passed through
        ``StrUtil.delWhiteSpace`` and filtered with ``StrUtil.isEmpty``; the
        second surviving entry is stored.  ``item['source']`` stays "NULL"
        when fewer than two entries survive.  The stored value is logged.
        """
        xpath = '//div[@class="base-info"]/table/tbody/tr[1]/td[1]//text()'

        eles = selector.xpath(xpath).extract()

        item['source'] = "NULL"

        cleaned = [StrUtil.delWhiteSpace(ele) for ele in eles]
        kept = [text for text in cleaned if StrUtil.isEmpty(text)]
        # Guard on the filtered list, the one actually indexed: checking the
        # raw list allowed an IndexError when filtering dropped entries.
        if len(kept) > 1:
            item['source'] = kept[1]

        LogUtil.log("source(%s)" % item['source'])
Example #27
0
    def getScore(self, selector, item):
        """Extract the score and store it on ``item`` scaled by 10.

        Matched text nodes are passed through ``StrUtil.delWhiteSpace``,
        filtered with ``StrUtil.isEmpty`` and joined with spaces; the first
        decimal number in that text, times 10, is stored as a float.
        ``item['score']`` stays -1 when no number is found.  The value is
        logged with ``%d`` formatting.
        """
        xpath = '//dl[@class="clearfix"]/dd/div[@class="pf"]/span[@class="s-1 js-votepanel"]//text()'

        item['score'] = -1

        fragments = selector.xpath(xpath).extract()
        cleaned = [StrUtil.delWhiteSpace(fragment) for fragment in fragments]
        kept = [text for text in cleaned if StrUtil.isEmpty(text)]
        numbers = re.findall(r"\d+\.?\d*", " ".join(kept))
        if numbers:
            item['score'] = float(numbers[0]) * 10

        LogUtil.log("score(%d)" % item['score'])
Example #28
0
    def getSource(self, selector, item):
        """Extract the app's origin from the app intro and store it on ``item``.

        The first matched text node is passed through
        ``StrUtil.delWhiteSpace`` and stored; ``item['source']`` stays
        "NULL" when nothing matches.  The stored value is logged.
        """
        xpath = '//div[@class="app-intro"]//div[@class="origin-wrap"]//a[@class="origin"]/text()'

        item['source'] = "NULL"

        matches = selector.xpath(xpath).extract()
        if matches:
            item['source'] = StrUtil.delWhiteSpace(matches[0])

        LogUtil.log("source(%s)" % item['source'])
Example #29
0
    def getUpdateTime(self, selector, item):
        """Extract the update date from the base-info table as a timestamp.

        Text nodes of the second cell in the first row are passed through
        ``StrUtil.delWhiteSpace`` and filtered with ``StrUtil.isEmpty``; the
        second surviving entry is parsed as "%Y-%m-%d" and converted to a
        local-time timestamp via ``time.mktime``.  ``item['update_time']``
        stays -1 when fewer than two entries survive or the text does not
        parse.  The stored value is logged.
        """
        xpath = '//div[@class="base-info"]/table/tbody/tr[1]/td[2]//text()'

        eles = selector.xpath(xpath).extract()

        item['update_time'] = long(-1)

        cleaned = [StrUtil.delWhiteSpace(ele) for ele in eles]
        kept = [text for text in cleaned if StrUtil.isEmpty(text)]
        # Guard on the filtered list, the one actually indexed: checking the
        # raw list allowed an IndexError when filtering dropped entries.
        if len(kept) > 1:
            try:
                d = datetime.datetime.strptime(kept[1], "%Y-%m-%d")
            except ValueError as err:
                # Unexpected date text must not abort the whole callback;
                # keep the -1 sentinel and record why.
                LogUtil.log("update_time parse failed(%r)" % (err,))
            else:
                item['update_time'] = long(time.mktime(d.timetuple()))

        LogUtil.log("update_time(%d)" % item['update_time'])
Example #30
0
    def getCategory(self, selector, item):
        """Extract the category trail from the app navigation and store it.

        Every link text after the first is passed through
        ``StrUtil.delWhiteSpace`` and the results are joined with "-".
        "NULL" is stored when fewer than two link texts are found.  The
        stored value is logged.
        """
        xpath = '//div[@class="app-nav"]//a/text()'

        crumbs = selector.xpath(xpath).extract()
        if len(crumbs) > 1:
            item['category'] = "-".join(
                [StrUtil.delWhiteSpace(crumb) for crumb in crumbs[1:]])
        else:
            item['category'] = "NULL"

        LogUtil.log("category(%s)" % item['category'])