def downloadImg(id, url):
    """Look up a title for an image via Google search-by-image and record it.

    Steps performed:
      1. Increment the global counter ``WHERE``.
      2. Fetch the Google "searchbyimage" page for ``url`` through
         ``Gzzip.newOpen`` and pull the argument of the first
         ``initialize(...)`` call out of the response body.
      3. Turn that argument into an absolute results URL, fetch it, and take
         the first text matching ``italic">...</a`` as the title
         (``'no title'`` when nothing matches).
      4. Store an ``UPDATE `spider` ...`` statement in the global ``SQLS``,
         try to run it on the externally defined ``cursor``, and write the
         statement to the externally defined ``f`` whether or not the
         execution succeeded.

    Args:
        id: Row id placed in the ``WHERE `id`=`` clause (converted with str()).
        url: Image URL appended to the Google search-by-image query string.

    Raises:
        IndexError: if the first response contains no ``initialize(...)`` call.
    """
    global WHERE
    global SQLS
    WHERE += 1

    search_url = "http://www.google.com/searchbyimage?image_url=" + url
    print(search_url)
    landing_html = Gzzip.newOpen(search_url).read()

    scan_flags = re.MULTILINE | re.S
    init_args = re.findall(r'initialize\((.*?)\)', landing_html, scan_flags)

    # The captured argument is a site-relative "/search..." path with "\xNN"
    # escapes; rewriting "\x" to "%" lets unquote() decode those escapes.
    results_url = init_args[0].replace('/search', 'http://www.google.com/search')
    results_url = results_url.replace('\\x', '%')
    # Decoding can leave HTML-encoded ampersands behind; turn them back into
    # plain "&" separators (the previous replace('&', '&') was a no-op).
    results_url = urllib2.unquote(results_url).replace('&amp;', '&')
    # Drop the first and last character (the quotes around the argument).
    results_url = results_url[1:-1]

    results_html = Gzzip.newOpen(results_url).read()
    titles = re.findall(r'italic">(.*?)</a', results_html, scan_flags)
    title = titles[0] if titles else 'no title'

    # The title comes from a remote page, so it must be quoted as an SQL
    # string literal. re.escape targets regular-expression syntax, not SQL,
    # so escape the backslash and the single quote explicitly instead.
    # NOTE(review): assumes a MySQL-style server with backslash escapes
    # enabled (the backtick identifiers suggest MySQL) - confirm. A
    # parameterized cursor.execute() would be preferable once the driver's
    # paramstyle is known.
    safe_title = title.replace('\\', '\\\\').replace("'", "\\'")
    SQLS = ("UPDATE `spider` SET `is_named`=1,`title`='" + safe_title
            + "' WHERE `id`=" + str(id) + ";")

    try:
        print(SQLS)
        cursor.execute(SQLS)
    except Exception as exc:
        # Stay best-effort (the statement is still written to f below), but
        # report the failure instead of hiding it.
        print("downloadImg: could not apply update for id %s: %s" % (id, exc))
    f.write(SQLS)
def pageContent(url):
    """Scrape the image source, origin link and tags from an imgspark page.

    Args:
        url: Site-relative path; it is appended to "http://www.imgspark.com".

    Returns:
        A three-element list ``[image_src, location, tags]`` where
        ``image_src`` is the ``src`` of the element with ``id="lrg_image"``,
        ``location`` is the first link inside ``<span id="source_content">``
        (the string ``'None'`` when absent), and ``tags`` is the list of
        ``title="..."`` values found inside the first
        ``<ul class="list_tags_horizontal">`` (``[]`` when there is no such
        list).

    Raises:
        IndexError: if the page has no ``id="lrg_image"`` element.
    """
    response = Gzzip.newOpen("http://www.imgspark.com" + url)
    # Elsewhere in this file the result of Gzzip.newOpen is consumed through
    # .read(); the regex functions need the text itself, so read the body
    # when the helper hands back a file-like object.
    html = response.read() if hasattr(response, 'read') else response
    scan_flags = re.MULTILINE | re.S

    # Main image: mandatory, so a missing match is allowed to raise.
    image_src = re.findall(r'id="lrg_image" src="(.*?)"', html, scan_flags)[0]

    # Origin link: optional, represented by the string 'None' when missing.
    locations = re.findall(
        r'<span id="source_content"><a href="(.*?)"', html, scan_flags)
    location = locations[0] if locations else 'None'

    # Tags: titles of the entries in the first horizontal tag list, if any.
    tag_lists = re.findall(
        r'<ul class="list_tags_horizontal">(.*?)</ul>', html, scan_flags)
    if tag_lists:
        tags = re.findall(r'title="(.*?)"', tag_lists[0], scan_flags)
    else:
        tags = []

    return [image_src, location, tags]
def listhref(id):
    """Return the image-page links found on one "popular / all time" page.

    Args:
        id: Page number; it is converted with str() and inserted into
            "http://www.imgspark.com/image/popular/all/alltime/<id>/".

    Returns:
        A list with the ``href`` of the first ``<a>`` inside every
        ``<div class="image_wrap">`` on the page, in document order (empty
        when nothing matches).
    """
    listing_url = ("http://www.imgspark.com/image/popular/all/alltime/"
                   + str(id) + "/")
    response = Gzzip.newOpen(listing_url)
    # Elsewhere in this file the result of Gzzip.newOpen is consumed through
    # .read(); re.findall needs the text itself, so read the body when the
    # helper hands back a file-like object.
    html = response.read() if hasattr(response, 'read') else response

    wrap_pattern = r'<div class="image_wrap">\s+<a href="(.*?)"(?:.*?)\s+</div>'
    return re.findall(wrap_pattern, html, re.MULTILINE | re.S)