Ejemplo n.º 1
0
    def select_from_xpath(self, xpath, item_rule, item_body, response, item_dict):
        '''
            Extract the value of one item field using an XPath rule.

            argv:
                xpath: XPath expression applied to item_body
                item_rule: extraction rule for the field; this method reads
                    its 'XMLPathSelectType' and 'XMLPathType' entries
                item_body: selector over the HTML (must provide .select())
                response: HTTP response; only response.encoding is read,
                    and only for InnerDateTime rules
                item_dict: dict-shaped item; for InnerTextWithPic rules its
                    'image_urls' entry is overwritten with the img src
                    values found under the matched nodes
            return:
                For the inner-text rule types: the matched text nodes
                joined together, each line stripped and blank lines
                dropped (then passed through Utils.format_datetime for
                InnerDateTime rules). For any other rule type: the
                extracted matches joined with single spaces.
        '''
        item_dom = item_body.select(xpath)
        if item_rule['XMLPathSelectType'] == ConfConstants.XMLPathSelectType.OnlyOne:
            # Only the first match is wanted.
            item_dom = item_dom[:1]

        path_type = item_rule['XMLPathType']
        inner_text_types = (ConfConstants.XMLPathType.InnerText,
                            ConfConstants.XMLPathType.InnerTextWithPic,
                            ConfConstants.XMLPathType.InnerDateTime)
        if path_type not in inner_text_types:
            # Not an inner-text rule: return the raw extracted matches.
            return " ".join(item_dom.extract())

        if path_type == ConfConstants.XMLPathType.InnerTextWithPic:
            # Inner text with pictures: also record the image URLs on the item.
            item_dict['image_urls'] = item_dom.select('.//img/@src').extract()

        raw_text = "".join(item_dom.select('.//text()').extract())
        # Strip every line once and drop the ones that end up empty.
        stripped_lines = (line.strip() for line in raw_text.split("\n"))
        item_value = "\n".join(line for line in stripped_lines if line)

        if path_type == ConfConstants.XMLPathType.InnerDateTime:
            try:
                item_value = Utils.format_datetime(item_value.decode(response.encoding))
            except Exception:
                # Best-effort fallback: if decoding or parsing the decoded
                # text fails for any reason, retry with the undecoded text.
                # Exception (not a bare except) so KeyboardInterrupt and
                # SystemExit still propagate.
                item_value = Utils.format_datetime(item_value)
        return item_value
Ejemplo n.º 2
0
def load_site_conf(site):
    '''
        Load a site-rule XML file and publish its contents on Conf and Utils.

        argv:
            site: path of the XML configuration file to read
        return:
            None. Results are stored as attributes:
                Utils.settings, Utils.conf_dict, Utils.daemon
                Conf.conf_dict, Conf.sites_rule, Conf.ua, Conf.uuid,
                Conf.task_id, Conf.pid, Conf.output_files
        raises:
            IOError/OSError if the file cannot be opened. A lookup error
            (KeyError for a plain dict) if a required entry such as
            'SiteRule', 'Sites', 'Site', 'TaskId', 'XMLFileName',
            'JsonFileName', 'TableName' or 'Hadoop' is missing.
            ValueError/TypeError if 'Uuid' or 'TaskId' is not integer-like.
    '''
    Utils.settings = get_project_settings()

    # Context manager so the file handle is closed even if read() fails
    # (the original file(...).read() never closed it).
    with open("%s" % site) as conf_file:
        conf_dict = xmltodict.parse(conf_file.read())

    site_rule = conf_dict['SiteRule']
    sites_rule = site_rule['Sites']['Site']
    if not isinstance(sites_rule, list):
        # Normalize a lone (non-list) Site entry to a one-element list.
        sites_rule = [sites_rule]

    Conf.conf_dict = conf_dict
    Utils.conf_dict = conf_dict
    Conf.sites_rule = sites_rule
    Conf.ua = site_rule.get('UserAgent', '')

    Utils.get_mongodb_client()
    db = Utils.get_db()
    # Look up the task-status row whose uuid matches the configured Uuid.
    rows = list(db.select(Utils.settings['MYSQL_TASKSTATUS_TABLE'],
                          what="id",
                          where="uuid=$uuid",
                          vars={"uuid": site_rule.get('Uuid', "")}))

    Conf.uuid = int(site_rule.get('Uuid', 0))
    # Prefer the id of the matching row; fall back to the configured TaskId.
    Conf.task_id = int(rows[0]['id']) if rows else int(site_rule['TaskId'])
    # NOTE(review): pid is always taken from TaskId (not an OS process id)
    # -- confirm this is the intended meaning.
    Conf.pid = int(site_rule['TaskId'])

    Conf.output_files = {
        ConfConstants.Output.XML: site_rule['XMLFileName'],
        ConfConstants.Output.JSON: site_rule['JsonFileName'],
        ConfConstants.Output.MongoDB: site_rule['TableName'],
        ConfConstants.Output.Hadoop: site_rule['Hadoop'],
    }
    Utils.daemon = ServiceDaemon()