コード例 #1
0
 def parse(self, response):
     """Yield a ``FishItem`` for every image entry on a result page.

     The keyword is the text captured between ``q=`` and ``&first`` in the
     unquoted response URL. Per-image metadata is the JSON stored in the
     ``m`` attribute of each ``div.imgpt > a`` element. The numbers taken
     from the first and second declarations of that element's ``style``
     attribute are stored as ``height`` and ``width`` respectively.
     ``self.count`` is written to each item and then incremented.
     """
     keyword = re.match('.*q=(.*)&first.*',
                        urllib.parse.unquote(response.url)).group(1)
     page = fromstring(response.text)
     metas = [json.loads(raw)
              for raw in page.xpath("//div[@class='imgpt']/a/@m")]

     # Each style string is expected to look like "<prop>:<n>px;<prop>:<n>px".
     sizes = []
     for style in page.xpath("//div[@class='imgpt']/a/@style"):
         declarations = style.split(';')
         sizes.append((declarations[0].split(':')[1].rstrip('px'),
                       declarations[1].split(':')[1].rstrip('px')))

     for (first_px, second_px), meta in zip(sizes, metas):
         item = FishItem()
         fields = {
             'Spidername': self.Spidername,
             'Spiderinfo': self.getSpiderinfo(),
             'fromURL': meta['purl'],
             # Original note: "this one is local"; no thumbnail URL is kept.
             'thumbURL': "none",
             'objURL': meta['murl'],
             'height': int(first_px),
             'width': int(second_px),
             'saveURL': "none",
             'type': meta['murl'].split('.')[-1],
             'size': 0,
             'name': meta['murl'].split('/')[-1],
             'keyword': keyword,
             'classification': keyword,
             'info': "",
             'count': self.count,
         }
         for key, value in fields.items():
             item[key] = value
         self.count += 1
         yield item
コード例 #2
0
ファイル: fishbase.py プロジェクト: muryliang/python_prac
 def imageparse(self, response):
     """Yield a ``FishItem`` for each tooltip image on the response page.

     Image sources are read from ``a.tooltip > span > img`` elements and
     made absolute against ``response.url``. The keyword/classification is
     taken from ``response.meta['type']``. Dimensions, size and count are
     not known here and are written as ``0``.
     """
     species = response.meta['type']
     document = fromstring(response.text)
     sources = document.xpath("//a[@class='tooltip']/span/img/@src")
     absolute_urls = [urljoin(response.url, src) for src in sources]
     spider_info = self.getSpiderinfo()

     for picture_url in absolute_urls:
         item = FishItem()
         fields = {
             'Spidername': self.Spidername,
             'Spiderinfo': spider_info,
             'fromURL': response.url,
             # Original note: "this one is local"; no thumbnail URL is kept.
             'thumbURL': "none",
             'objURL': picture_url,
             'height': 0,
             'width': 0,
             'size': 0,
             'saveURL': "none",
             'type': picture_url.rpartition('.')[2],
             'name': picture_url.rpartition('/')[2],
             'keyword': species,
             'classification': species,
             'info': 'none',
             'count': 0,  # not used in tw
         }
         for key, value in fields.items():
             item[key] = value
         yield item
コード例 #3
0
 def parse(self, response):
     """Yield a ``FishItem`` for each entry of the JSON ``imgs`` list.

     The keyword is the text captured between ``word=`` and ``&cg`` in the
     unquoted response URL. URL, dimension and type fields are copied from
     each entry of ``json.loads(response.text)['imgs']``. ``self.count`` is
     written to each item and then incremented.
     """
     keyword = re.match('.*word=(.*)&cg.*',
                        urllib.parse.unquote(response.url)).group(1)
     entries = json.loads(response.text)['imgs']

     for entry in entries:
         item = FishItem()
         fields = {
             'Spidername': self.Spidername,
             'Spiderinfo': self.getSpiderinfo(),
             # Original note: "this one is local"; no thumbnail URL is kept.
             'thumbURL': "none",
             'fromURL': entry['fromURL'],
             'objURL': entry['objURL'],
             'saveURL': "none",
             'width': entry['width'],
             'height': entry['height'],
             'type': entry['type'],
             'size': 0,
             'name': entry['objURL'].split('/')[-1],
             'keyword': keyword,
             'classification': keyword,
             'info': "currently none",
             'count': self.count,
         }
         for key, value in fields.items():
             item[key] = value
         self.count += 1
         yield item
コード例 #4
0
ファイル: algaebase.py プロジェクト: muryliang/python_prac
 def parse_pictures(self, fromurl, urllist, fishtype):
     """Yield a ``FishItem`` for every URL in *urllist*.

     Args:
         fromurl: value stored in each item's ``fromURL`` field.
         urllist: iterable of image URLs; each becomes one item's ``objURL``.
         fishtype: value stored as both ``keyword`` and ``classification``.

     ``self.count`` is written to each item and then incremented.
     """
     print("now in item processing")
     for picture_url in urllist:
         item = FishItem()
         fields = {
             'Spidername': self.Spidername,
             'Spiderinfo': self.getSpiderinfo(),
             'fromURL': fromurl,
             # Original note: "this one is local"; no thumbnail URL is kept.
             'thumbURL': "none",
             'objURL': picture_url,
             'saveURL': "none",
             'width': 0,
             'height': 0,
             'type': picture_url.split(".")[-1],
             'size': 0,
             'name': picture_url.split('/')[-1],
             'keyword': fishtype,
             'classification': fishtype,
             'info': "currently none",
             'count': self.count,
         }
         for key, value in fields.items():
             item[key] = value
         self.count += 1
         yield item
コード例 #5
0
ファイル: baidubak.py プロジェクト: muryliang/python_prac
 def jsonparse(self, response):
     """Yield a ``FishItem`` for each entry of the JSON ``imgs`` list.

     The keyword/classification comes from ``response.meta['type']``. URL,
     dimension and type fields are copied from each entry of
     ``json.loads(response.text)['imgs']``. This callback writes the string
     ``"none"`` into ``size`` and does not set a ``count`` field.
     """
     species = response.meta['type']
     entries = json.loads(response.text)['imgs']

     for entry in entries:
         item = FishItem()
         fields = {
             'Spidername': self.Spidername,
             'Spiderinfo': self.getSpiderinfo(),
             'fromURL': entry['fromURL'],
             # Original note: "this one is local"; no thumbnail URL is kept.
             'thumbURL': "none",
             'objURL': entry['objURL'],
             'saveURL': "none",
             'width': entry['width'],
             'height': entry['height'],
             'type': entry['type'],
             'size': "none",
             'name': entry['objURL'].split('/')[-1],
             'keyword': species,
             'classification': species,
             'info': "currently none",
         }
         for key, value in fields.items():
             item[key] = value
         yield item
コード例 #6
0
    def process_internal_images(self, response):
        """Yield a ``FishItem`` for each picture found on the response page.

        Image sources under ``div.pic > a > img`` are made absolute against
        ``response.url``. If an anchor under ``div.pic`` that wraps a ``div``
        is present, the value of the first query parameter of its ``href``
        is appended as one extra image URL. ``type`` and ``infostr`` are read
        from ``response.meta``.
        """
        fishtype = response.meta['type']
        infostr = response.meta['infostr']

        intree = fromstring(response.text)
        urllist = intree.xpath("//div[@class='pic']/a/img/@src")
        imgurllist = [urljoin(response.url, parturl) for parturl in urllist]

        # Best-effort extraction of the optional last image. Both "no such
        # anchor" and "href does not have the expected shape" are skipped;
        # previously the second case raised AttributeError on ``None.group``
        # and aborted the whole callback.
        lastlinks = intree.xpath("//div[@class='pic']/a[./div]/@href")
        lastmatch = None
        if lastlinks:
            lastmatch = re.match('[^=]*=([^&]*)&.*', lastlinks[0])
        if lastmatch is not None:
            imgurllist.append(lastmatch.group(1))
        else:
            print("no last img, just continue")

        spiderinfo = self.getSpiderinfo()
        for imgurl in imgurllist:
            item = FishItem()
            item['Spidername'] = self.Spidername
            item['Spiderinfo'] = spiderinfo
            item['fromURL'] = response.url
            # Original note: "this one is local"; no thumbnail URL is kept.
            item['thumbURL'] = "none"
            item['objURL'] = imgurl
            item['height'] = 0
            item['width'] = 0
            item['size'] = 0
            item['saveURL'] = "none"
            item['type'] = item['objURL'].split('.')[-1]
            item['name'] = item['objURL'].split('/')[-1]
            item['keyword'] = fishtype
            item['classification'] = fishtype
            item['info'] = infostr
            item['count'] = 0  # not used in tw
            yield item
コード例 #7
0
    def process_external_images(self, response):
        """Yield a ``FishItem`` for each image on the externally linked page.

        The first ``span.slabel8 > a`` link in the response is fetched with a
        synchronous ``requests.get`` using ``self.myheader``. Every
        ``a.tooltip > span > img`` source on the fetched page is emitted as
        an item. ``type`` and ``infostr`` are read from ``response.meta``;
        ``fromURL`` stays the URL of the original response.

        Yields nothing (after printing a message) when the link is absent.
        """
        fishtype = response.meta['type']
        infostr = response.meta['infostr']
        fbtree = fromstring(response.text)
        fblinks = fbtree.xpath("//span[@class='slabel8']/a/@href")
        if not fblinks:
            print("error when get img page external")
            return

        fbimgpage = urljoin(response.url, fblinks[0])
        # NOTE(review): this is a blocking request with no timeout.
        imgpage = requests.get(fbimgpage, headers=self.myheader)
        imgtree = fromstring(imgpage.text)
        picurls = imgtree.xpath("//a[@class='tooltip']/span/img/@src")
        # The sources come from the fetched page, so relative ones must be
        # resolved against that page's final URL, not the referring page.
        fbpicurls = [urljoin(imgpage.url, tmpurl) for tmpurl in picurls]
        spiderinfo = self.getSpiderinfo()
        for imgurl in fbpicurls:
            item = FishItem()
            item['Spidername'] = self.Spidername
            item['Spiderinfo'] = spiderinfo
            item['fromURL'] = response.url
            # Original note: "this one is local"; no thumbnail URL is kept.
            item['thumbURL'] = "none"
            item['objURL'] = imgurl
            item['height'] = 0
            item['width'] = 0
            item['size'] = 0
            item['saveURL'] = "none"
            item['type'] = item['objURL'].split('.')[-1]
            item['name'] = item['objURL'].split('/')[-1]
            item['keyword'] = fishtype
            item['classification'] = fishtype
            item['info'] = infostr
            item['count'] = 0  # not used in tw
            yield item
コード例 #8
0
ファイル: google.py プロジェクト: muryliang/python_prac
 def parse(self, response):
     """Yield a ``FishItem`` for each ``rg_meta`` element on the page.

     The keyword is the text captured between ``q=`` and ``&ijn`` in the
     unquoted response URL, with ``+`` replaced by a space. Each matching
     ``div`` holds a JSON object: ``ou``/``oh``/``ow`` are preferred for the
     image URL and dimensions, ``tu``/``th``/``tw`` are the fallback, and an
     empty URL with zero dimensions is used when neither is present.
     ``self.count`` is written to each item and then incremented.
     """
     unquoted_url = urllib.parse.unquote(response.url)
     keyword = re.match('.*q=(.*)&ijn.*', unquoted_url).group(1)
     keyword = keyword.replace("+", " ")
     page = fromstring(response.text)

     for node in page.xpath("//div[contains(@class, 'rg_meta')]"):
         meta = json.loads(node.text)
         item = FishItem()
         spider_name = self.Spidername
         spider_info = self.getSpiderinfo()
         source_page = meta.get('ru', "none")

         if 'ou' in meta:
             picture = (meta['ou'], meta['oh'], meta['ow'])
         elif 'tu' in meta:
             picture = (meta['tu'], meta['th'], meta['tw'])
         else:
             picture = ("", 0, 0)
         picture_url, height, width = picture

         fields = {
             'Spidername': spider_name,
             'Spiderinfo': spider_info,
             'fromURL': source_page,
             # Original note: "this one is local"; no thumbnail URL is kept.
             'thumbURL': "none",
             'objURL': picture_url,
             'height': height,
             'width': width,
             'saveURL': "none",
             'type': meta.get('ity', 'none'),
             'size': 0,
             'name': picture_url.split('/')[-1],
             'keyword': keyword,
             'classification': keyword,
             'info': meta.get('pt', 'none'),
             'count': self.count,
         }
         for key, value in fields.items():
             item[key] = value
         self.count += 1
         yield item