Example #1
 def parse_item(self, response):
     item = Item()
     word = response.meta['word']
     fields = json.loads(self.conf.get("fields"))
     l = ItemLoader(item, response)
     if fields.get("fields", "") == "":
         logging.error(u"内容解析未得到!!!")
         return l.load_item()
     item.fields["url"] = Field()
     item.fields["spider_jobid"] = Field()
     l.add_value("url", response.url)
     l.add_value("spider_jobid", self.spider_jobid)
     item.fields['word'] = Field()
     l.add_value('word', word)
     # Load field definitions from the dynamic config and build Field/xpath rules (method 1)
     for k in loadMySQL(self.name_spider)['fields'].keys():
         if fields.get("fields", "") == "":
             logging.error(u"内容解析未得到!!!")
             return l.load_item()
         if fields.get("fields").get(k) != None:
             item.fields[k] = Field()
             if fields.get("fields").get(k).keys()[0] == "xpath":
                 l.add_xpath(
                     k,
                     u"{}".format(fields.get("fields").get(k).get("xpath")),
                     MapCompose(unicode.strip))
             elif fields.get("fields").get(k).keys()[0] == "value":
                 l.add_value(
                     k,
                     u"{}".format(fields.get("fields").get(k).get("value")))
     return l.load_item()
 def parse(self, response):
     item = Item()
     sel = Selector(response)
     fields = json.loads(self.conf.get("fields"))
     rules = json.loads(self.conf.get("rules"))
     loops = rules.get("rules").get("rules_listxpath")
     if fields.get("fields", "") == "":
         logging.error(u"内容解析未得到!!!")
         yield item
     item.fields["url"] = Field()
     item.fields["spider_jobid"] = Field()
     item["spider_jobid"] = self.spider_jobid
     item.fields['word'] = Field()
     item['word'] = response.meta.get("word")
     # Load field definitions from the dynamic config and build Field/xpath rules (method 1)
     for loop in sel.xpath("{}".format(loops)):
         item['url'] = loop.xpath(u"{}".format(fields.get("fields").get("url").get("xpath"))).extract()
         for k in loadMySQL(self.conf.get("spider_name")):
             if fields.get("fields").get(k[2]) != None:
                 item.fields[k[2]] = Field()
                 if fields.get("fields").get(k[2]).keys()[0] == "xpath":
                     item[k[2]] = loop.xpath(u"{}".format(fields.get("fields").get(k[2]).get("xpath"))).extract()
                 elif fields.get("fields").get(k[2]).keys()[0] == "value":
                     item[k[2]] = u"{}".format(fields.get("fields").get(k[2]).get("value"))
         yield item
 def __init__(self,
              spider_jobid=None,
              name_spider=None,
              debug=False,
              *args,
              **kwargs):
     self.spider_jobid = spider_jobid
     self.name_spider = name_spider
     self.debug = debug
     # Load the spider configuration (start_urls, fields, rules) via api_netspider.
     self.conf = api_netspider(name_spider)
     self.loadconf(self.name_spider, self.spider_jobid, self.conf)
     # Field names configured for this spider, read from MySQL.
     self.keys = loadMySQL(self.name_spider)['fields'].keys()
     super(BBsListSpider, self).__init__(*args, **kwargs)
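
These methods are Python 2 era Scrapy code (note `unicode.strip` and indexing `dict.keys()[0]`) and assume imports of `json`, `logging`, and Scrapy's `Item`, `Field`, `Selector`, `ItemLoader`, and `MapCompose`, plus the project-specific helpers `api_netspider`, `loadMySQL`, and `loadconf`. The configuration schema itself is not shown anywhere in the examples; judging only from how `fields`, `rules`, and `start_urls` are accessed, a configuration could plausibly look like the sketch below. The field names `title` and `author` and all values are illustrative assumptions, not part of the original code.

 # A minimal sketch of the configuration the methods above appear to expect.
 # Everything here is an assumption inferred from the access patterns; only the
 # key names "start_urls", "fields", "rules", "rules_listxpath", "xpath" and
 # "value" come from the original code.
 import json

 conf = {
     # Comma-separated list, split in Example #5's __init__.
     "start_urls": "http://example.com/list?page=1,http://example.com/list?page=2",
     # JSON string; each field entry carries exactly one key, either "xpath" or "value".
     "fields": json.dumps({
         "fields": {
             "url": {"xpath": "./a/@href"},
             "title": {"xpath": "./a/text()"},
             "author": {"value": "unknown"},
         }
     }),
     # JSON string; "rules_listxpath" selects the repeating list nodes in parse().
     "rules": json.dumps({
         "rules": {"rules_listxpath": "//ul[@class='news-list']/li"}
     }),
 }
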
Example #4
 def __init__(self,
              spider_jobid=None,
              name_spider=None,
              debug=False,
              *args,
              **kwargs):
     self.spider_jobid = spider_jobid
     self.name_spider = name_spider
     # self.redis_key = "newsspider:strat_urls"
     self.debug = debug
     self.conf = api_netspider(name_spider)
     self.loadconf(self.name_spider, self.spider_jobid, self.conf)
     self.keys = loadMySQL(self.name_spider)['fields'].keys()
     super(NewSpider, self).__init__(*args, **kwargs)
Example #5
 def __init__(self,
              spider_jobid=None,
              name_spider=None,
              debug=False,
              *args,
              **kwargs):
     self.spider_jobid = spider_jobid
     self.name_spider = name_spider
     # self.redis_key = "{}:strat_urls".format(name_spider)
     # logging.info(self.redis_key)
     self.debug = debug
     self.conf = api_netspider(name_spider)
     self.start_urls = self.conf.get("start_urls",
                                     "").replace("\r",
                                                 "").replace("\n",
                                                             "").split(',')
     self.loadconf(self.name_spider, self.spider_jobid, self.conf)
     self.keys = loadMySQL(self.name_spider)['fields'].keys()
     super(NewSplashSpider, self).__init__(*args, **kwargs)
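
All three constructors take `spider_jobid` and `name_spider` as keyword arguments, which is how Scrapy forwards spider arguments given on the command line (`scrapy crawl <name> -a spider_jobid=... -a name_spider=...`) or passed to `CrawlerProcess.crawl()`. A hypothetical launch of `NewSpider` from Example #4 might look like the sketch below; the module path and argument values are assumptions.

 # A minimal sketch, assuming NewSpider is importable from this project.
 # from myproject.spiders.news import NewSpider   # module path is hypothetical
 from scrapy.crawler import CrawlerProcess

 process = CrawlerProcess()
 # Keyword arguments are forwarded to NewSpider.__init__, the same way
 # "scrapy crawl <name> -a key=value" passes arguments on the command line.
 process.crawl(NewSpider, spider_jobid="20180101-0001", name_spider="news")
 process.start()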