def parse_item(self, response):
    """Build one item from a detail page using the field rules loaded
    from the spider's JSON configuration.

    Declares a scrapy ``Field`` dynamically for every column reported by
    ``loadMySQL`` and wires either an xpath or a constant-value rule into
    the ``ItemLoader``.

    :param response: the scrapy response; ``response.meta['word']`` must
        carry the search word that produced this page.
    :returns: the loaded item (possibly near-empty when no field rules
        are configured).
    """
    item = Item()
    word = response.meta['word']
    fields = json.loads(self.conf.get("fields"))
    l = ItemLoader(item, response)
    # Hoisted out of the loop: this value never changes per-item, so the
    # original's repeated in-loop check was redundant.
    field_conf = fields.get("fields", "")
    if field_conf == "":
        logging.error(u"内容解析未得到!!!")
        return l.load_item()
    # Fields present on every item regardless of configuration.
    item.fields["url"] = Field()
    item.fields["spider_jobid"] = Field()
    l.add_value("url", response.url)
    l.add_value("spider_jobid", self.spider_jobid)
    item.fields['word'] = Field()
    l.add_value('word', word)
    # Dynamically declare a Field plus loader rule for each configured
    # column from the spider's MySQL schema. (method 1)
    for k in loadMySQL(self.name_spider)['fields'].keys():
        rule = field_conf.get(k)
        if rule is None:
            continue
        item.fields[k] = Field()
        # Membership test instead of `.keys()[0]`: Python 2 dict key
        # order is arbitrary, so indexing the "first" key was fragile
        # whenever a rule carried more than one key. xpath wins over
        # a literal value when both are present.
        if "xpath" in rule:
            l.add_xpath(
                k,
                u"{}".format(rule.get("xpath")),
                MapCompose(unicode.strip))
        elif "value" in rule:
            l.add_value(k, u"{}".format(rule.get("value")))
    return l.load_item()
def parse(self, response):
    """Yield one item per list entry matched by the configured list
    xpath (``rules_listxpath``).

    Field rules come from the spider's JSON configuration; a scrapy
    ``Field`` is declared dynamically for every column reported by
    ``loadMySQL`` (``k[2]`` is assumed to be the column name — matches
    the row-tuple indexing used throughout; confirm against the schema).

    :param response: the scrapy response for a list page.
    """
    item = Item()
    sel = Selector(response)
    fields = json.loads(self.conf.get("fields"))
    rules = json.loads(self.conf.get("rules"))
    loops = rules.get("rules").get("rules_listxpath")
    field_conf = fields.get("fields", "")
    if field_conf == "":
        logging.error(u"内容解析未得到!!!")
        yield item
        # BUG FIX: the original fell through after yielding and kept
        # parsing with an empty configuration.
        return
    item.fields["url"] = Field()
    item.fields["spider_jobid"] = Field()
    item["spider_jobid"] = self.spider_jobid
    item.fields['word'] = Field()
    item['word'] = response.meta.get("word")
    # One item per matched list node. NOTE(review): the same Item object
    # is mutated and re-yielded each iteration, as in the original; the
    # downstream pipeline must consume items synchronously for this to
    # be safe.
    for loop in sel.xpath("{}".format(loops)):
        item['url'] = loop.xpath(
            u"{}".format(field_conf.get("url").get("xpath"))).extract()
        for k in loadMySQL(self.conf.get("spider_name")):
            rule = field_conf.get(k[2])
            if rule is None:
                continue
            item.fields[k[2]] = Field()
            # Membership test instead of `.keys()[0]`: Python 2 dict key
            # order is arbitrary, so indexing the "first" key was fragile.
            if "xpath" in rule:
                item[k[2]] = loop.xpath(
                    u"{}".format(rule.get("xpath"))).extract()
            elif "value" in rule:
                item[k[2]] = u"{}".format(rule.get("value"))
        yield item
def __init__(self, spider_jobid=None, name_spider=None, debug=False, *args, **kwargs):
    """Initialise the list spider from its remote configuration.

    :param spider_jobid: job identifier stamped onto every item.
    :param name_spider: logical spider name used to look up its config.
    :param debug: debug flag stored for later use.
    """
    # Bookkeeping parameters.
    self.spider_jobid = spider_jobid
    self.name_spider = name_spider
    self.debug = debug
    # Pull the remote configuration, apply it, and cache the schema keys.
    conf = api_netspider(name_spider)
    self.conf = conf
    self.loadconf(name_spider, spider_jobid, conf)
    self.keys = loadMySQL(name_spider)['fields'].keys()
    super(BBsListSpider, self).__init__(*args, **kwargs)
def __init__(self, spider_jobid=None, name_spider=None, debug=False, *args, **kwargs):
    """Initialise the news spider from its remote configuration.

    :param spider_jobid: job identifier stamped onto every item.
    :param name_spider: logical spider name used to look up its config.
    :param debug: debug flag stored for later use.
    """
    # Bookkeeping parameters.
    self.spider_jobid = spider_jobid
    self.name_spider = name_spider
    self.debug = debug
    # Pull the remote configuration, apply it, and cache the schema keys.
    conf = api_netspider(name_spider)
    self.conf = conf
    self.loadconf(name_spider, spider_jobid, conf)
    self.keys = loadMySQL(name_spider)['fields'].keys()
    super(NewSpider, self).__init__(*args, **kwargs)
def __init__(self, spider_jobid=None, name_spider=None, debug=False, *args, **kwargs):
    """Initialise the splash spider from its remote configuration.

    Builds ``start_urls`` from the comma-separated ``start_urls`` entry
    of the remote config.

    :param spider_jobid: job identifier stamped onto every item.
    :param name_spider: logical spider name used to look up its config.
    :param debug: debug flag stored for later use.
    """
    self.spider_jobid = spider_jobid
    self.name_spider = name_spider
    self.debug = debug
    self.conf = api_netspider(name_spider)
    # Robustness fix: strip stray whitespace around each URL and drop
    # blank entries (trailing commas, empty config value). The original
    # handed scrapy [''] when "start_urls" was empty or "".split(',').
    raw = self.conf.get("start_urls", "").replace("\r", "").replace("\n", "")
    self.start_urls = [u.strip() for u in raw.split(',') if u.strip()]
    self.loadconf(self.name_spider, self.spider_jobid, self.conf)
    self.keys = loadMySQL(self.name_spider)['fields'].keys()
    super(NewSplashSpider, self).__init__(*args, **kwargs)