Example #1
0
 def parse_xpath(self, response, xpath):
     """Build one AppItem per value matched by ``xpath`` in ``response``.

     Every extracted value is joined onto ``response.url`` with
     ``urljoin``, reported through ``log.msg`` at INFO level, and stored
     under the item's ``'url'`` key.

     Returns the list of items, in the order the values were extracted.
     """
     matches = Selector(response).xpath(xpath).extract()
     items = []
     for raw_link in matches:
         full_url = urljoin(response.url, raw_link)
         log.msg("Catch an application: %s" % full_url, level=log.INFO)
         item = AppItem()
         item['url'] = full_url
         items.append(item)
     return items
Example #2
0
 def parse_xpath(self, response, xpath):
     """Build one AppItem per value matched by ``xpath`` in ``response``.

     Every extracted value is joined onto ``response.url`` with
     ``urljoin``, logged at INFO level, and stored under the item's
     ``'url'`` key.

     Returns the list of items, in the order the values were extracted.
     """
     appItemList = []
     # The query goes through the response's own ``xpath`` method, so no
     # separate Selector object has to be constructed here.
     for url in response.xpath(xpath).extract():
         url = urljoin(response.url, url)
         logging.info("Catch an application: %s", url)
         appItem = AppItem()
         appItem['url'] = url
         appItemList.append(appItem)
     return appItemList
def parse_anzhi(response):
    """Build anzhi.com download items from ``onclick`` attributes.

    Reads the ``onclick`` attribute of every ``<a>`` directly under
    ``<div id='btn'>``, takes the first run of digits found in it, and
    substitutes that into the ``dl_app.php`` download URL.

    Attribute values that contain no digits are skipped, so one
    unexpected handler does not abort the whole page with an
    ``AttributeError`` from the failed regex match.

    Returns a list of ``AppItem`` objects with only ``'url'`` set.
    """
    xpath = "//div[@id='btn']/a/@onclick"
    appItemList = []
    hxs = HtmlXPathSelector(response)
    for script in hxs.select(xpath).extract():
        match = re.search(r"\d+", script)
        if match is None:
            # No digits in this handler, so there is nothing to build a
            # download URL from.
            continue
        url = "http://www.anzhi.com/dl_app.php?s=%s&n=5" % (match.group(),)
        appItem = AppItem()
        appItem['url'] = url
        appItemList.append(appItem)
    return appItemList
def parse_anzhi(response):
    """Build anzhi.com download items from ``onclick`` attributes.

    Reads the ``onclick`` attribute of every ``<a>`` directly under
    ``<div class='detail_down'>``, takes the first run of digits found
    in it, and substitutes that into the ``dl_app.php`` download URL.

    Attribute values that contain no digits are skipped, so one
    unexpected handler does not abort the whole page with an
    ``AttributeError`` from the failed regex match.

    Returns a list of ``AppItem`` objects with only ``'url'`` set.
    """
    xpath = "//div[@class='detail_down']/a/@onclick"
    appItemList = []
    sel = Selector(response)
    for script in sel.xpath(xpath).extract():
        match = re.search(r"\d+", script)
        if match is None:
            # No digits in this handler, so there is nothing to build a
            # download URL from.
            continue
        url = "http://www.anzhi.com/dl_app.php?s=%s&n=5" % (match.group(),)
        appItem = AppItem()
        appItem['url'] = url
        appItemList.append(appItem)
    return appItemList
Example #5
0
    def parse_xpath(self, response, xpath, key):
        """Build a populated AppItem for every value matched by ``xpath``.

        Each matched value is joined onto ``response.url`` and stored as
        the item's ``'url'``. The remaining fields are read from the same
        page using the XPath expressions found in ``self.scrape_rules``
        under ``key``; every extracted text has all whitespace removed,
        and the description and version-info fields additionally have
        literal ``'<br />'`` substrings removed.

        Returns the list of items, in the order the values were extracted.
        """
        name_rules = self.scrape_rules['name_xpath']
        type_rules = self.scrape_rules['type_xpath']
        size_rules = self.scrape_rules['size_xpath']
        description_rules = self.scrape_rules['description_xpath']
        version_rules = self.scrape_rules['version_xpath']
        time_rules = self.scrape_rules['time_xpath']
        version_info_rules = self.scrape_rules['versionInfo_xpath']

        # (item field, rule table, strip '<br />'?) in assignment order.
        field_specs = (
            ('app_name', name_rules, False),
            ('app_type', type_rules, False),
            ('app_description', description_rules, True),
            ('app_size', size_rules, False),
            ('app_version', version_rules, False),
            ('app_time', time_rules, False),
            ('app_versionInfo', version_info_rules, True),
        )

        page = Selector(text=response.body)

        def squeeze(rule, drop_breaks):
            # Concatenate every match, optionally drop '<br />', then
            # remove all whitespace.
            joined = ''.join(page.xpath(rule).extract())
            if drop_breaks:
                joined = joined.replace('<br />', '')
            return ''.join(joined.split())

        collected = []
        for link in page.xpath(xpath).extract():
            absolute = urljoin(response.url, link)
            item = AppItem()
            item['url'] = absolute
            for field, rule_table, drop_breaks in field_specs:
                item[field] = squeeze(rule_table[key], drop_breaks)
            collected.append(item)
        return collected
    def parse_xpath(self, response, xpath):
        """Build one AppItem per value selected by ``xpath`` in ``response``.

        Every extracted value is joined onto ``response.url`` with
        ``urljoin``, reported through ``log.msg`` at INFO level, and
        stored under the item's ``'url'`` key.

        Returns the list of items, in the order the values were extracted.
        """
        extracted = HtmlXPathSelector(response).select(xpath).extract()
        items = []
        for raw_link in extracted:
            full_url = urljoin(response.url, raw_link)
            log.msg("Catch an application: %s" % full_url, level=log.INFO)
            item = AppItem()
            item['url'] = full_url
            items.append(item)
        return items
    
    #def parse_anzhi(self, response, xpath):
    #    appItemList = []
    #    hxs = HtmlXPathSelector(response)
    #    for script in hxs.select(xpath).extract():
    #        id = re.search(r"\d+", script).group()
    #        url = "http://www.anzhi.com/dl_app.php?s=%s&n=5" % (id,)
    #        appItem = AppItem()
    #        appItem['url'] = url
    #        appItemList.append(appItem)
    #    return appItemList

            
Example #7
0
def parse_anzhi(self, response, key):
    """Build populated anzhi.com download items from ``onclick`` attributes.

    Reads the ``onclick`` attribute of every ``<a>`` directly under
    ``<div class='detail_down'>``, takes the first run of digits found
    in it, and substitutes that into the ``dl_app.php`` download URL.
    The name, type, size and description fields are read from the same
    page using the XPath expressions found in ``self.scrape_rules``
    under ``key``; each extracted text has all whitespace removed, and
    the description additionally has literal ``'<br />'`` removed.

    Attribute values that contain no digits are skipped, so one
    unexpected handler does not abort the whole page with an
    ``AttributeError`` from the failed regex match.

    Returns a list of ``AppItem`` objects.
    """
    xpath = "//div[@class='detail_down']/a/@onclick"
    appItemList = []
    name_xpath_rule = self.scrape_rules['name_xpath']
    type_xpath_rule = self.scrape_rules['type_xpath']
    size_xpath_rule = self.scrape_rules['size_xpath']
    description_xpath_rule = self.scrape_rules['description_xpath']
    sel = Selector(text=response.body)
    for script in sel.xpath(xpath).extract():
        match = re.search(r"\d+", script)
        if match is None:
            # No digits in this handler, so there is nothing to build a
            # download URL from.
            continue
        url = "http://www.anzhi.com/dl_app.php?s=%s&n=5" % (match.group(),)
        appItem = AppItem()
        appItem['url'] = url
        # ''.join(text.split()) removes every whitespace character from
        # the concatenated matches.
        app_name = ''.join(sel.xpath(name_xpath_rule[key]).extract())
        appItem['app_name'] = ''.join(app_name.split())
        app_type = ''.join(sel.xpath(type_xpath_rule[key]).extract())
        appItem['app_type'] = ''.join(app_type.split())
        app_size = ''.join(sel.xpath(size_xpath_rule[key]).extract())
        appItem['app_size'] = ''.join(app_size.split())
        app_description = ''.join(
            sel.xpath(description_xpath_rule[key]).extract()
        ).replace('<br />', '')
        appItem['app_description'] = ''.join(app_description.split())
        appItemList.append(appItem)
    return appItemList