예제 #1
0
    def getPageLink(self, selector, prefix):
        """Collect the pager hrefs from the page and complete them with ``prefix``.

        Args:
            selector: Object exposing ``xpath(...).extract()`` (presumably a
                Scrapy selector -- confirm against the caller).
            prefix: Passed to ``StrUtil.completeURL`` together with each href.

        Returns:
            The result of ``filter(StrUtil.isEmpty, links)`` applied to the
            completed links.
        """
        xpath = '//div[@class="pager"]//a/@href'

        links = [StrUtil.completeURL(prefix, href)
                 for href in selector.xpath(xpath).extract()]

        # NOTE(review): filter() keeps the entries for which StrUtil.isEmpty is
        # truthy. If isEmpty really means "is empty", only empty links are
        # returned -- confirm against StrUtil before relying on this.
        return filter(StrUtil.isEmpty, links)
예제 #2
0
    def getTag(self, selector, item):
        """Store the page's side tags in ``item['tag']`` as one "-"-joined string.

        Each extracted tag text is passed through ``StrUtil.delWhiteSpace``
        before joining. When the joined result is empty (no tags matched, or
        everything cleaned down to nothing) the literal "NULL" is stored.

        Args:
            selector: Object exposing ``xpath(...).extract()`` (presumably a
                Scrapy selector -- confirm against the caller).
            item: Mapping-like object that receives the ``'tag'`` entry.
        """
        xpath = '//div[@class="side-tags clearfix"]/div/a/text()'

        cleaned = [StrUtil.delWhiteSpace(text)
                   for text in selector.xpath(xpath).extract()]
        # join() replaces the manual "first element vs. rest" concatenation.
        joined = "-".join(cleaned)

        item['tag'] = joined if joined else "NULL"

        LogUtil.log("tag(%s)" % item['tag'])
예제 #3
0
    def getCategory(self, selector, item):
        """Store the page's categories in ``item['category']``, joined by "-".

        Each extracted category text is passed through
        ``StrUtil.delWhiteSpace`` before joining. When the joined result is
        empty the literal "NULL" is stored.

        Args:
            selector: Object exposing ``xpath(...).extract()`` (presumably a
                Scrapy selector -- confirm against the caller).
            item: Mapping-like object that receives the ``'category'`` entry.
        """
        xpath = '//dd[@class="tag-box"]/a/text()'

        cleaned = [StrUtil.delWhiteSpace(text)
                   for text in selector.xpath(xpath).extract()]
        # join() replaces the manual "first element vs. rest" concatenation.
        joined = "-".join(cleaned)

        item['category'] = joined if joined else "NULL"

        LogUtil.log("category(%s)" % item['category'])
예제 #4
0
    def getEditorComment(self, selector, item):
        """Store the first "head-content" text in ``item['editor_comment']``.

        Falls back to the literal "NULL" when the XPath matches nothing. The
        chosen value (match or fallback) is passed through
        ``StrUtil.delWhiteSpace`` before being stored.

        Args:
            selector: Object exposing ``xpath(...).extract()``.
            item: Mapping-like object that receives ``'editor_comment'``.
        """
        xpath = '//div[@class="app-detail"]//span[@class="head-content"]/text()'

        matches = selector.xpath(xpath).extract()
        raw_comment = matches[0] if matches else "NULL"

        item['editor_comment'] = StrUtil.delWhiteSpace(raw_comment)
        LogUtil.log("editor_comment(%s)" % item['editor_comment'])
예제 #5
0
    def getName(self, selector, item):
        """Store the app name from the "app-intro" header in ``item['name']``.

        Uses the first matched text, or the literal "NULL" when nothing
        matches; either value is passed through ``StrUtil.delWhiteSpace``.

        Args:
            selector: Object exposing ``xpath(...).extract()``.
            item: Mapping-like object that receives the ``'name'`` entry.
        """
        xpath = '//div[@class="app-intro"]//h1[@class="app-name"]/span/text()'

        matches = selector.xpath(xpath).extract()
        raw_name = matches[0] if matches else "NULL"

        item['name'] = StrUtil.delWhiteSpace(raw_name)
        LogUtil.log("name(%s)" % item['name'])
예제 #6
0
    def getVersion(self, selector, item):
        """Store the app version text in ``item['version']``.

        The first match is passed through ``StrUtil.delWhiteSpace``; when
        nothing matches, the literal "NULL" is stored as-is.

        Args:
            selector: Object exposing ``xpath(...).extract()``.
            item: Mapping-like object that receives the ``'version'`` entry.
        """
        # The value is located via the <dt> whose text is the Chinese label
        # for "version", taking the element that follows it, rather than by
        # a fixed position such as dd[5].
        xpath = u'//dl[@class="infos-list"]/dt[text() = "版本"]/following::*[1]/text()'
        matches = selector.xpath(xpath).extract()

        if not matches:
            item['version'] = "NULL"
        else:
            item['version'] = StrUtil.delWhiteSpace(matches[0])

        LogUtil.log("version(%s)" % item['version'])
예제 #7
0
    def getName(self, selector, item):
        """Store the app name from the ``itemprop="name"`` title in ``item['name']``.

        Uses the first matched text, or the literal "NULL" when nothing
        matches; either value is passed through ``StrUtil.delWhiteSpace``.

        Args:
            selector: Object exposing ``xpath(...).extract()``.
            item: Mapping-like object that receives the ``'name'`` entry.
        """
        xpath = '//p[@class="app-name"]/span[@class="title" and @itemprop="name"]/text()'

        matches = selector.xpath(xpath).extract()
        raw_name = matches[0] if matches else "NULL"

        item['name'] = StrUtil.delWhiteSpace(raw_name)
        LogUtil.log("name(%s)" % item['name'])
예제 #8
0
    def getDescInfo(self, selector, item):
        """Store the long description text in ``item['desc_info']``.

        All text nodes under the "brief-long" paragraphs are joined with a
        single space. When nothing matches, the literal "NULL" is used. The
        resulting value is passed through ``StrUtil.delWhiteSpace``.

        Args:
            selector: Object exposing ``xpath(...).extract()``.
            item: Mapping-like object that receives the ``'desc_info'`` entry.
        """
        xpath = '//div[@class="app-detail"]//div[@class="brief-long"]/p//text()'

        fragments = selector.xpath(xpath).extract()
        description = " ".join(fragments) if fragments else "NULL"

        item['desc_info'] = StrUtil.delWhiteSpace(description)
        LogUtil.log("desc_info(%s)" % item['desc_info'])
예제 #9
0
    def getDescInfo(self, selector, item):
        """Store the ``itemprop="description"`` text in ``item['desc_info']``.

        All text nodes under the description div are joined with a single
        space. When nothing matches, the literal "NULL" is used. The resulting
        value is passed through ``StrUtil.delWhiteSpace``.

        Args:
            selector: Object exposing ``xpath(...).extract()``.
            item: Mapping-like object that receives the ``'desc_info'`` entry.
        """
        xpath = '//div[@itemprop="description"]//text()'

        fragments = selector.xpath(xpath).extract()
        description = " ".join(fragments) if fragments else "NULL"

        item['desc_info'] = StrUtil.delWhiteSpace(description)
        LogUtil.log("desc_info(%s)" % item['desc_info'])
예제 #10
0
    def getSource(self, selector, item):
        """Store the text of the "origin" link in ``item['source']``.

        ``item['source']`` is first set to the literal "NULL"; if the XPath
        matches, it is overwritten with the first match passed through
        ``StrUtil.delWhiteSpace``.

        Args:
            selector: Object exposing ``xpath(...).extract()``.
            item: Mapping-like object that receives the ``'source'`` entry.
        """
        xpath = '//div[@class="app-intro"]//div[@class="origin-wrap"]//a[@class="origin"]/text()'

        # Default is written before querying, as in the original flow.
        item['source'] = "NULL"

        matches = selector.xpath(xpath).extract()
        if matches:
            item['source'] = StrUtil.delWhiteSpace(matches[0])

        LogUtil.log("source(%s)" % item['source'])
예제 #11
0
    def loadStartURLs(self):
        """Fill ``self.start_urls`` with per-app URLs and fixed entry pages.

        Every line of ``data/apps.txt`` is passed through
        ``StrUtil.delWhiteSpace`` and appended to the app URL prefix; three
        fixed listing URLs are appended afterwards.

        Raises:
            IOError/OSError: If ``data/apps.txt`` cannot be opened or read.
        """
        prefix = "http://www.wandoujia.com/apps/"

        # URLs built from the file; the with-block closes the handle even if
        # processing a line raises.
        with open('data/apps.txt', 'r') as apps_file:
            for line in apps_file:
                self.start_urls.append(prefix + StrUtil.delWhiteSpace(line))

        # Fixed URLs
        fixed_urls = (
            "http://www.wandoujia.com/apps",           # apps home page
            "http://www.wandoujia.com/category/app",   # Android software
            "http://www.wandoujia.com/category/game",  # Android games
        )
        for url in fixed_urls:
            self.start_urls.append(url)
        # self.start_urls.append("http://www.wandoujia.com/apps/air.jp.funkyland.AliceHouse2") # old-style app page
        # self.start_urls.append("http://www.wandoujia.com/apps/com.tencent.mm") # new-style app page
        # self.start_urls.append("http://www.wandoujia.com/category/408") # travel category home page