Example #1
0
    def execute(self, task):
        """Extract one attribute value for ``task`` and store it.

        Nothing happens unless the tag of ``task.crawlerEle`` equals
        ``datanode.ATTR``. For such elements the value is obtained as follows:

        * if the EXTRACT_XPATH attribute is set, it is evaluated against
          ``task.htmlNode``; when EXTRACT_RULE_STR is also set, that rule is
          then applied to the xpath result;
        * otherwise, if EXTRACT_RULE is set, it is applied to
          ``task.htmlNode.tostring()``;
        * otherwise the value stays an empty string.

        Afterwards ``task.parentNode.is_last`` is set to 1 when the IS_LAST
        attribute is the string ``'true'`` (0 otherwise) and the value is
        passed to ``self.saveAttr``.

        Exceptions raised while extracting are printed and logged through
        ``logger.error``; they are not propagated to the caller.

        :param task: the task being processed; ``crawlerEle``, ``htmlNode``,
            ``frontier`` and ``parentNode`` are read from it.
        """
        ele = task.crawlerEle
        tagName = ele.tag
        value = ''
        # Bound before the try block so the error message in the handler can
        # always be formatted, even when the failure happens before the
        # attribute name has been read.
        name = None
        try:
            if tagName == datanode.ATTR:
                attrib = ele.attrib
                name = attrib.get(datanode.NAME)
                extractXpath = attrib.get(datanode.EXTRACT_XPATH)
                extractRule = attrib.get(datanode.EXTRACT_RULE)
                isLast = attrib.get(datanode.IS_LAST, "false")

                if extractXpath:
                    value = parseutil.extractValueByXpath(
                        extractXpath, task.frontier.getNameSpace(),
                        task.htmlNode)

                    # Optional post-processing of the xpath result.
                    extractRuleStr = attrib.get(datanode.EXTRACT_RULE_STR)
                    if extractRuleStr:
                        value = parseutil.extractValueByRule(
                            extractRuleStr, value)
                elif extractRule:
                    # No xpath configured: apply the rule to the whole node.
                    value = parseutil.extractValueByRule(
                        extractRule, task.htmlNode.tostring())

                task.parentNode.is_last = 1 if isLast == 'true' else 0

                self.saveAttr(task, value)

        except Exception as e:
            msg = "executing ExtractProcess has occurred exception, attrname is : %s" % name
            print("%s %s" % (str(e), msg))
            logger.error(msg)
Example #2
0
    def execute(self, task):
        """Queue one child task per additional result page.

        Does nothing when the crawler element has no PAGINATE_XPATH attribute
        or when ``task.hasPagiNate`` is already truthy (tasks created here
        carry ``hasPagiNate = True``).

        Every node selected by PAGINATE_XPATH is inspected for two values;
        when several nodes are selected, the values from the last one win:

        * the highest page number, from PAGINATE_MAX_XPATH (optionally
          refined by PAGINATE_MAX_RULE) or, without an xpath, from
          PAGINATE_MAX_RULE applied to ``child.tostring()``;
        * the page URL prefix, from PAGINATE_URL_XPATH / PAGINATE_URL_RULE in
          the same manner.

        For each page number from 2 up to the highest page a new ``Task`` is
        created with ``nextCrawlerUrl`` set to the hard-coded host
        ``http://place.qyer.com`` + URL prefix + page number, and added to
        the frontier.

        Exceptions raised along the way are printed and swallowed.

        :param task: the task being processed.
        """
        attrib = task.crawlerEle.attrib
        paginateXpath = attrib.get(datanode.PAGINATE_XPATH)
        if paginateXpath is None or task.hasPagiNate:
            return
        try:
            htmlNode = task.htmlNode
            if htmlNode is None:
                return

            nameSpace = task.frontier.getNameSpace()
            loopEles = parseutil.selectNodes(paginateXpath, nameSpace, htmlNode)

            paginateMaxXpath = attrib.get(datanode.PAGINATE_MAX_XPATH)
            paginateMaxRule = attrib.get(datanode.PAGINATE_MAX_RULE)
            paginateUrlXpath = attrib.get(datanode.PAGINATE_URL_XPATH)
            paginateUrlRule = attrib.get(datanode.PAGINATE_URL_RULE)

            maxPage = 0
            url = ''
            for child in loopEles:
                if paginateMaxXpath:
                    maxPage = parseutil.extractValueByXpath(
                        paginateMaxXpath, nameSpace, child)
                    if paginateMaxRule:
                        maxPage = parseutil.extractValueByRule(
                            paginateMaxRule, maxPage)
                elif paginateMaxRule:
                    # Only fall back to the rule when one is configured,
                    # mirroring the URL branch below, so the helper is never
                    # called with a missing rule.
                    maxPage = parseutil.extractValueByRule(
                        paginateMaxRule, child.tostring())

                if paginateUrlXpath:
                    url = parseutil.extractValueByXpath(
                        paginateUrlXpath, nameSpace, child)
                    if paginateUrlRule:
                        url = parseutil.extractValueByRule(paginateUrlRule, url)
                elif paginateUrlRule:
                    url = parseutil.extractValueByRule(
                        paginateUrlRule, child.tostring())

            if maxPage == '':
                return
            lastPage = int(maxPage)
            if lastPage <= 0 or url == '':
                return

            # Page 1 is the page already being processed, so start at 2.
            for pageNo in range(2, lastPage + 1):
                nextPageUrl = "http://place.qyer.com" + url + str(pageNo)
                childTask = Task(task.getFrontier())
                childTask.setCrawlerEle(task.getCrawlerEle())
                childTask.htmlNode = htmlNode
                childTask.nextCrawlerUrl = nextPageUrl
                childTask.parentId = task.parentId
                childTask.parentNode = task.parentNode
                # Stops the child task from paginating again.
                childTask.hasPagiNate = True
                childTask.getFrontier().addTask(childTask)
        except Exception as e:
            print("%s %s" % (str(e), "executing PaginateProcess has occurred exception"))
Example #3
0
    def execute(self, task):
        """Work out the URL that ``task`` should crawl next.

        When ``task.nextCrawlerUrl`` is already set it is left untouched.
        Otherwise the URL is taken from the crawler element: the URL
        attribute is the starting value; if URL_XPATH is set, the xpath is
        evaluated against ``task.htmlNode`` and replaces it, and if URL_RULE
        is set as well, the rule is applied to the xpath result. The outcome
        is written back to ``task.nextCrawlerUrl``.

        Exceptions are printed and swallowed.

        :param task: the task being processed.
        """
        try:
            crawlerUrl = task.nextCrawlerUrl
            if crawlerUrl:
                return

            crawlerEle = task.crawlerEle
            # Every attribute lookup sits under this guard so a task without
            # a crawler element no longer fails on ``crawlerEle.attrib``.
            if crawlerEle is not None:
                attrib = crawlerEle.attrib
                crawlerUrl = attrib.get(datanode.URL)
                urlXpath = attrib.get(datanode.URL_XPATH)
                urlRule = attrib.get(datanode.URL_RULE)

                if urlXpath:
                    crawlerUrl = parseutil.extractValueByXpath(
                        urlXpath, task.frontier.getNameSpace(), task.htmlNode)

                    if urlRule:
                        crawlerUrl = parseutil.extractValueByRule(
                            urlRule, crawlerUrl)

            # TODO: if the xpath yields nothing, retry with regex processing.

            # Set the URL for the next crawl.
            task.nextCrawlerUrl = crawlerUrl
        except Exception as e:
            print(e)
            print("execute UrlPreProcess has an exception")
Example #4
0
    def execute(self, task):
        """Work out the URL that ``task`` should crawl next.

        When ``task.nextCrawlerUrl`` is already set it is left untouched.
        Otherwise the URL is taken from the crawler element: the URL
        attribute is the starting value; if URL_XPATH is set, the xpath is
        evaluated against ``task.htmlNode`` and replaces it, and if URL_RULE
        is set as well, the rule is applied to the xpath result. The outcome
        is written back to ``task.nextCrawlerUrl``.

        Exceptions are printed and swallowed.

        :param task: the task being processed.
        """
        try:
            crawlerUrl = task.nextCrawlerUrl
            if crawlerUrl:
                return

            crawlerEle = task.crawlerEle
            # Every attribute lookup sits under this guard so a task without
            # a crawler element no longer fails on ``crawlerEle.attrib``.
            if crawlerEle is not None:
                attrib = crawlerEle.attrib
                crawlerUrl = attrib.get(datanode.URL)
                urlXpath = attrib.get(datanode.URL_XPATH)
                urlRule = attrib.get(datanode.URL_RULE)

                if urlXpath:
                    crawlerUrl = parseutil.extractValueByXpath(
                        urlXpath, task.frontier.getNameSpace(), task.htmlNode)

                    if urlRule:
                        crawlerUrl = parseutil.extractValueByRule(
                            urlRule, crawlerUrl)

            # TODO: if the xpath yields nothing, retry with regex processing.

            # Set the URL for the next crawl.
            task.nextCrawlerUrl = crawlerUrl
        except Exception as e:
            print(e)
            print("execute UrlPreProcess has an exception")
Example #5
0
    def execute(self, task):
        """Extract one attribute value for ``task`` and store it.

        Nothing happens unless the tag of ``task.crawlerEle`` equals
        ``datanode.ATTR``. For such elements the value is obtained as follows:

        * if the EXTRACT_XPATH attribute is set, it is evaluated against
          ``task.htmlNode``; when EXTRACT_RULE_STR is also set, that rule is
          then applied to the xpath result;
        * otherwise, if EXTRACT_RULE is set, it is applied to
          ``task.htmlNode.tostring()``;
        * otherwise the value stays an empty string.

        Afterwards ``task.parentNode.is_last`` is set to 1 when the IS_LAST
        attribute is the string ``'true'`` (0 otherwise) and the value is
        passed to ``self.saveAttr``.

        Exceptions raised while extracting are printed and logged through
        ``logger.error``; they are not propagated to the caller.

        :param task: the task being processed; ``crawlerEle``, ``htmlNode``,
            ``frontier`` and ``parentNode`` are read from it.
        """
        ele = task.crawlerEle
        tagName = ele.tag
        value = ''
        # Bound before the try block so the error message in the handler can
        # always be formatted, even when the failure happens before the
        # attribute name has been read.
        name = None
        try:
            if tagName == datanode.ATTR:
                attrib = ele.attrib
                name = attrib.get(datanode.NAME)
                extractXpath = attrib.get(datanode.EXTRACT_XPATH)
                extractRule = attrib.get(datanode.EXTRACT_RULE)
                isLast = attrib.get(datanode.IS_LAST, "false")

                if extractXpath:
                    value = parseutil.extractValueByXpath(
                        extractXpath, task.frontier.getNameSpace(),
                        task.htmlNode)

                    # Optional post-processing of the xpath result.
                    extractRuleStr = attrib.get(datanode.EXTRACT_RULE_STR)
                    if extractRuleStr:
                        value = parseutil.extractValueByRule(
                            extractRuleStr, value)
                elif extractRule:
                    # No xpath configured: apply the rule to the whole node.
                    value = parseutil.extractValueByRule(
                        extractRule, task.htmlNode.tostring())

                task.parentNode.is_last = 1 if isLast == 'true' else 0

                self.saveAttr(task, value)

        except Exception as e:
            msg = "executing ExtractProcess has occurred exception, attrname is : %s" % name
            print("%s %s" % (str(e), msg))
            logger.error(msg)
Example #6
0
# -*- encoding: utf-8 -*-
from util import parseutil
import re







if __name__ == "__main__":
    # Manual smoke test for parseutil.extractValueByRule: each sample pairs a
    # regular expression with the text it is applied to, and the extracted
    # value is printed. Patterns are raw strings so the backslashes reach the
    # regex engine untouched, and the subject is called ``text`` so the
    # builtin ``str`` is not rebound.
    samples = [
        (r"[\w:\/\.\?x=&_]+\|([\-0-9,\.]+)&.*",
         "http://maps.google.com/maps/api/staticmap?size=270x180&markers=icon:http://static.qyer.com/images/place/icon_coord_current.png|-80.417343,77.116013&sensor=false"),
        (r"[\D]+(\d+).*", "啊rrrrr33中国"),
        (r"[\D]+(\d+)[\D]*", "...38:"),
    ]
    for regex, text in samples:
        value = parseutil.extractValueByRule(regex, text)
        print("execute result is %s " % value)

Example #7
0
    def execute(self, task):
        """Queue one child task per additional result page.

        Does nothing when the crawler element has no PAGINATE_XPATH attribute
        or when ``task.hasPagiNate`` is already truthy (tasks created here
        carry ``hasPagiNate = True``).

        Every node selected by PAGINATE_XPATH is inspected for two values;
        when several nodes are selected, the values from the last one win:

        * the highest page number, from PAGINATE_MAX_XPATH (optionally
          refined by PAGINATE_MAX_RULE) or, without an xpath, from
          PAGINATE_MAX_RULE applied to ``child.tostring()``;
        * the page URL prefix, from PAGINATE_URL_XPATH / PAGINATE_URL_RULE in
          the same manner.

        For each page number from 2 up to the highest page a new ``Task`` is
        created with ``nextCrawlerUrl`` set to the hard-coded host
        ``http://place.qyer.com`` + URL prefix + page number, and added to
        the frontier.

        Exceptions raised along the way are printed and swallowed.

        :param task: the task being processed.
        """
        attrib = task.crawlerEle.attrib
        paginateXpath = attrib.get(datanode.PAGINATE_XPATH)
        if paginateXpath is None or task.hasPagiNate:
            return
        try:
            htmlNode = task.htmlNode
            if htmlNode is None:
                return

            nameSpace = task.frontier.getNameSpace()
            loopEles = parseutil.selectNodes(paginateXpath, nameSpace, htmlNode)

            paginateMaxXpath = attrib.get(datanode.PAGINATE_MAX_XPATH)
            paginateMaxRule = attrib.get(datanode.PAGINATE_MAX_RULE)
            paginateUrlXpath = attrib.get(datanode.PAGINATE_URL_XPATH)
            paginateUrlRule = attrib.get(datanode.PAGINATE_URL_RULE)

            maxPage = 0
            url = ''
            for child in loopEles:
                if paginateMaxXpath:
                    maxPage = parseutil.extractValueByXpath(
                        paginateMaxXpath, nameSpace, child)
                    if paginateMaxRule:
                        maxPage = parseutil.extractValueByRule(
                            paginateMaxRule, maxPage)
                elif paginateMaxRule:
                    # Only fall back to the rule when one is configured,
                    # mirroring the URL branch below, so the helper is never
                    # called with a missing rule.
                    maxPage = parseutil.extractValueByRule(
                        paginateMaxRule, child.tostring())

                if paginateUrlXpath:
                    url = parseutil.extractValueByXpath(
                        paginateUrlXpath, nameSpace, child)
                    if paginateUrlRule:
                        url = parseutil.extractValueByRule(paginateUrlRule, url)
                elif paginateUrlRule:
                    url = parseutil.extractValueByRule(
                        paginateUrlRule, child.tostring())

            if maxPage == '':
                return
            lastPage = int(maxPage)
            if lastPage <= 0 or url == '':
                return

            # Page 1 is the page already being processed, so start at 2.
            for pageNo in range(2, lastPage + 1):
                nextPageUrl = "http://place.qyer.com" + url + str(pageNo)
                childTask = Task(task.getFrontier())
                childTask.setCrawlerEle(task.getCrawlerEle())
                childTask.htmlNode = htmlNode
                childTask.nextCrawlerUrl = nextPageUrl
                childTask.parentId = task.parentId
                childTask.parentNode = task.parentNode
                # Stops the child task from paginating again.
                childTask.hasPagiNate = True
                childTask.getFrontier().addTask(childTask)
        except Exception as e:
            print("%s %s" % (str(e), "executing PaginateProcess has occurred exception"))