コード例 #1
0
ファイル: sogou.py プロジェクト: colenhyt/tuto
 def __init__(self, category=None, *args, **kwargs):
     # sogou.com spider: crawling always begins at the Weixin search portal.
     self.sitekey = "sogou.com"
     self.start_urls = ["http://weixin.sogou.com"]
     # No page template has been identified yet.
     self.temp_id = 0
     # Collaborators: template-key extraction, item discovery, persistence.
     self.keysparser = PageKeysParser()
     self.itemsfounder = ItemsFounder()
     self.datamgr = DataMgr()
コード例 #2
0
ファイル: sogou.py プロジェクト: colenhyt/tuto
 def __init__(self, category=None, *args, **kwargs):
     # sogou.com spider: crawling always begins at the Weixin search portal.
     # (Stray trailing semicolons removed -- un-Pythonic, PEP 8.)
     self.sitekey = "sogou.com"
     self.temp_id = 0  # no page template identified yet
     # Collaborators: template-key extraction, item discovery, persistence.
     self.keysparser = PageKeysParser()
     self.itemsfounder = ItemsFounder()
     self.datamgr = DataMgr()
     self.start_urls = ["http://weixin.sogou.com"]
コード例 #3
0
ファイル: jd.py プロジェクト: colenhyt/tuto
    def __init__(self, category=None, *args, **kwargs):
        self.sitekey = "jd.com"
        self.temp_id = 0
        self.datamgr = DataMgr()
        self.start_urls = []
        urls = self.datamgr.geturls(urlkey=self.sitekey)
        links = []
        for uitem in urls:
            links.append(uitem[2])
        links = sorted(links, key=lambda d: d[2])
        self.start_urls.extend(links)

        if (len(self.start_urls) <= 0):
            self.start_urls.append("http://www.jd.com")

        print "初始化新url:", len(self.start_urls)

        self.urlsparser = UrlsParser()
        self.urlkeys = ["http://channel.", "http://list.", "http://item."]
        self.urlkeys = [self.sitekey]
        self.ignorekeys = ["#comments-list", "/adclick", "javascript:"]
コード例 #4
0
ファイル: jd.py プロジェクト: colenhyt/tuto
class JdSpider(CrawlSpider):
    name = "jd"
    allowed_domains = ["jd.com"]
    start_urls = ['http://www.jd.com/']

    def __init__(self, category=None, *args, **kwargs):
        self.sitekey = "jd.com"
        self.temp_id = 0
        self.datamgr = DataMgr();
        self.start_urls = []
        urls = self.datamgr.geturls(urlkey=self.sitekey)
        links = []
        for uitem in urls:
          links.append(uitem[2])
        links = sorted(links, key=lambda d: d[2])
        self.start_urls.extend(links)

        if (len(self.start_urls)<=0):
         self.start_urls.append("http://www.jd.com")

        print "初始化新url:",len(self.start_urls)

        self.urlsparser = UrlsParser()
        self.urlkeys = ["http://channel.","http://list.","http://item."]
        self.urlkeys = [self.sitekey]
        self.ignorekeys = ["#comments-list","/adclick","javascript:"]

    def parse(self, response):
        baseurl = response.url
        print "取回:"+baseurl
        self.datamgr.updateurl(baseurl,status=2)

        urlitems = []

        links = self.urlsparser.parse(response.body,baseurl,self.urlkeys,self.ignorekeys,catkeys=self.urlkeys)
        links = sorted(links, key=lambda d: d[2])
        urls = self.datamgr.inserturls(links,relate_url1=baseurl,status=1)
        siteitems = []
        for url in urls:
          if (url[0].find("http://item.")>=0):
            siteitems.append([url[0],url[1],baseurl])
          urlitems.append(Request(url[0],callback=self.parse))

        self.datamgr.insertitems(siteitems)

        return urlitems

    # def parse_urliteml(self, response):
    #   siteitem = []
    #   baseurl = response.url
    #   self.datamgr.updateurl(baseurl,temp_id=self.template[0])
    #
    #   return items
コード例 #5
0
ファイル: jd.py プロジェクト: colenhyt/tuto
class JdSpider(CrawlSpider):
    name = "jd"
    allowed_domains = ["jd.com"]
    start_urls = ['http://www.jd.com/']

    def __init__(self, category=None, *args, **kwargs):
        self.sitekey = "jd.com"
        self.temp_id = 0
        self.datamgr = DataMgr()
        self.start_urls = []
        urls = self.datamgr.geturls(urlkey=self.sitekey)
        links = []
        for uitem in urls:
            links.append(uitem[2])
        links = sorted(links, key=lambda d: d[2])
        self.start_urls.extend(links)

        if (len(self.start_urls) <= 0):
            self.start_urls.append("http://www.jd.com")

        print "初始化新url:", len(self.start_urls)

        self.urlsparser = UrlsParser()
        self.urlkeys = ["http://channel.", "http://list.", "http://item."]
        self.urlkeys = [self.sitekey]
        self.ignorekeys = ["#comments-list", "/adclick", "javascript:"]

    def parse(self, response):
        baseurl = response.url
        print "取回:" + baseurl
        self.datamgr.updateurl(baseurl, status=2)

        urlitems = []

        links = self.urlsparser.parse(response.body,
                                      baseurl,
                                      self.urlkeys,
                                      self.ignorekeys,
                                      catkeys=self.urlkeys)
        links = sorted(links, key=lambda d: d[2])
        urls = self.datamgr.inserturls(links, relate_url1=baseurl, status=1)
        siteitems = []
        for url in urls:
            if (url[0].find("http://item.") >= 0):
                siteitems.append([url[0], url[1], baseurl])
            urlitems.append(Request(url[0], callback=self.parse))

        self.datamgr.insertitems(siteitems)

        return urlitems
コード例 #6
0
ファイル: jd.py プロジェクト: colenhyt/tuto
    def __init__(self, category=None, *args, **kwargs):
        self.sitekey = "jd.com"
        self.temp_id = 0
        self.datamgr = DataMgr();
        self.start_urls = []
        urls = self.datamgr.geturls(urlkey=self.sitekey)
        links = []
        for uitem in urls:
          links.append(uitem[2])
        links = sorted(links, key=lambda d: d[2])
        self.start_urls.extend(links)

        if (len(self.start_urls)<=0):
         self.start_urls.append("http://www.jd.com")

        print "初始化新url:",len(self.start_urls)

        self.urlsparser = UrlsParser()
        self.urlkeys = ["http://channel.","http://list.","http://item."]
        self.urlkeys = [self.sitekey]
        self.ignorekeys = ["#comments-list","/adclick","javascript:"]
コード例 #7
0
ファイル: sogou.py プロジェクト: colenhyt/tuto
class SogouSpider(Spider):
    name = "sogou"
    allowed_domains = ["sogou.com"]

    def __init__(self, category=None, *args, **kwargs):
        self.sitekey = "sogou.com"
        self.temp_id = 0
        self.keysparser = PageKeysParser()
        self.itemsfounder = ItemsFounder()
        self.datamgr = DataMgr()
        self.start_urls = ["http://weixin.sogou.com"]

    def parse(self, response):
        print "连上:" + self.sitekey

        urlitems = []
        #1:爬内容页:
        urls = self.datamgr.geturls(sitekey=self.sitekey)
        for uitem in urls:
            url = uitem[2]
            urlitems.append(Request(url, callback=self.parse_siteurl))

        surlcount = len(urlitems)
        print "装载siteurl数量:", surlcount

        #2: 爬搜索页
        words = self.datamgr.getwords(8)
        for word in words:
            url = "http://weixin.sogou.com/weixin?query=" + word[1]
            urlitems.append(Request(url, callback=self.parse_rooturl))

        print "装载搜索页数量:", len(urlitems) - surlcount

        if (len(urlitems) == 0):
            print "没有任何下载url可爬"

        return urlitems

    def parse_rooturl(self, response):
        baseurl = response.url
        stemplate = self.datamgr.getsitetemplates(self.sitekey)
        nextsiteurls = []
        if (stemplate == None):
            found = self.keysparser.parse(response.body, baseurl)
            if (found):
                temp_id = self.datamgr.insertSiteTemplate(
                    self.sitekey, self.keysparser.itemskeys,
                    self.keysparser.pagingkeys)
                self.template = [
                    temp_id, self.keysparser.itemskeys,
                    self.keysparser.pagingkeys
                ]
                purls = self.datamgr.inserturls(self.keysparser.pagingurls,
                                                temp_id)
                nextsiteurls.extend(purls)
        else:
            self.template = [
                stemplate[0],
                eval(stemplate[2]),
                eval(stemplate[3])
            ]
            keys = [self.template[1], self.template[2]]
            found1, found2 = self.itemsfounder.find(response.body, baseurl,
                                                    keys, self.sitekey)
            if (found2):
                purls = self.datamgr.inserturls(self.itemsfounder.pagingurls,
                                                self.template[0])
                nextsiteurls.extend(purls)

        self.datamgr.updateurl(baseurl, temp_id=self.temp_id)
        itemurls = []
        for url in nextsiteurls:
            print url
            itemurls.append(Request(url, callback=self.parse_siteurl))
        return itemurls

    def parse_siteurl(self, response):
        baseurl = response.url
        self.datamgr.updateurl(baseurl, temp_id=self.template[0])
        keys = [self.template[1], self.template[2]]
        found1, found2 = self.itemsfounder.find(response.body, baseurl, keys,
                                                self.sitekey)
        items = []
        # if (found1):

        return items
コード例 #8
0
ファイル: sogou.py プロジェクト: colenhyt/tuto
class SogouSpider(Spider):
    name = "sogou"
    allowed_domains = ["sogou.com"]

    def __init__(self, category=None, *args, **kwargs):
        self.sitekey = "sogou.com"
        self.temp_id = 0
        self.keysparser = PageKeysParser();
        self.itemsfounder = ItemsFounder();
        self.datamgr = DataMgr();
        self.start_urls = ["http://weixin.sogou.com"]

    def parse(self, response):
        print "连上:"+self.sitekey

        urlitems = []
        #1:爬内容页:
        urls = self.datamgr.geturls(sitekey=self.sitekey)
        for uitem in urls:
          url = uitem[2]
          urlitems.append(Request(url,callback=self.parse_siteurl))

        surlcount = len(urlitems)
        print "装载siteurl数量:",surlcount

        #2: 爬搜索页
        words = self.datamgr.getwords(8)
        for word in words:
            url = "http://weixin.sogou.com/weixin?query="+word[1]
            urlitems.append(Request(url,callback=self.parse_rooturl))

        print "装载搜索页数量:",len(urlitems)-surlcount

        if (len(urlitems)==0):
          print "没有任何下载url可爬"

        return urlitems

    def parse_rooturl(self, response):
      baseurl = response.url
      stemplate = self.datamgr.getsitetemplates(self.sitekey)
      nextsiteurls = []
      if (stemplate==None):
        found = self.keysparser.parse(response.body,baseurl)
        if (found):
          temp_id = self.datamgr.insertSiteTemplate(self.sitekey,self.keysparser.itemskeys,self.keysparser.pagingkeys)
          self.template = [temp_id,self.keysparser.itemskeys,self.keysparser.pagingkeys]
          purls = self.datamgr.inserturls(self.keysparser.pagingurls,temp_id)
          nextsiteurls.extend(purls)
      else:
        self.template = [stemplate[0],eval(stemplate[2]),eval(stemplate[3])]
        keys = [self.template[1],self.template[2]]
        found1,found2 = self.itemsfounder.find(response.body,baseurl,keys,self.sitekey)
        if (found2):
          purls = self.datamgr.inserturls(self.itemsfounder.pagingurls,self.template[0])
          nextsiteurls.extend(purls)

      self.datamgr.updateurl(baseurl,temp_id=self.temp_id)
      itemurls = []
      for url in nextsiteurls:
        print url
        itemurls.append(Request(url,callback=self.parse_siteurl))
      return itemurls

    def parse_siteurl(self, response):
      baseurl = response.url
      self.datamgr.updateurl(baseurl,temp_id=self.template[0])
      keys = [self.template[1],self.template[2]]
      found1,found2 = self.itemsfounder.find(response.body,baseurl,keys,self.sitekey)
      items = []
      # if (found1):

      return items