Example #1
0
File: title.py Project: h2ero/note
def downloadImg(id, url):
    """Find a title for an image via Google search-by-image and store it.

    The image URL is submitted to Google's ``searchbyimage`` endpoint. The
    first ``initialize(...)`` argument in the response is turned into an
    absolute results URL, that page is fetched, and the first
    ``italic">...</a`` match is used as the title (``'no title'`` when there
    is none). The ``spider`` row with the given id is then updated and the
    statement text is appended to the module-level file ``f``.

    Args:
        id: Value matched against the ``id`` column of the ``spider`` table.
        url: Image URL; appended as-is to the search-by-image query string.

    Raises:
        IndexError: if the first response contains no ``initialize(...)``
            call.

    Side effects:
        Increments the global ``WHERE`` counter, rebinds the global ``SQLS``
        to the statement text, prints progress, and writes to ``f``.
    """
    global WHERE
    global SQLS
    WHERE += 1

    search_url = "http://www.google.com/searchbyimage?image_url=" + url
    print(search_url)
    first_page = Gzzip.newOpen(search_url).read()

    flags = re.MULTILINE | re.S
    targets = re.findall(r'initialize\((.*?)\)', first_page, flags)
    # The argument is a quoted JavaScript string holding a site-relative
    # path with \xNN escapes: make it absolute and turn the escapes into
    # percent-escapes so unquote() can decode them.
    result_url = targets[0].replace('/search', 'http://www.google.com/search')
    result_url = result_url.replace('\\x', '%')
    # Unquoting exposes HTML-encoded ampersands. The original call replaced
    # '&' with itself (a no-op); decoding the entity is presumably what was
    # intended, otherwise the query parameters stay glued together.
    result_url = urllib2.unquote(result_url).replace('&amp;', '&')
    # Drop the quote characters surrounding the JavaScript string.
    result_url = result_url[1:-1]

    result_page = Gzzip.newOpen(result_url).read()
    titles = re.findall(r'italic">(.*?)</a', result_page, flags)
    title = titles[0] if titles else 'no title'

    # The textual statement is kept unchanged for the SQLS global and the
    # log file, since other code may read either of them.
    SQLS = "UPDATE `spider` SET `is_named`=1,`title`='" + re.escape(title) + "' WHERE `id`=" + str(id) + ";"
    print(SQLS)
    try:
        # Execute with bound parameters: the title is scraped (untrusted)
        # text and re.escape() is regex escaping, not SQL escaping.
        # NOTE(review): assumes `cursor` is a DB-API cursor using the '%s'
        # paramstyle (as MySQLdb / PyMySQL do) - confirm where it is created.
        cursor.execute(
            "UPDATE `spider` SET `is_named`=1,`title`=%s WHERE `id`=%s",
            (title, id),
        )
    except Exception as err:
        # Still best-effort, but no longer silent about failed updates.
        print("spider update failed for id %s: %s" % (id, err))
    f.write(SQLS)
Example #2
0
File: spider.py Project: h2ero/note
def pageContent(url):
    """Scrape the image, its source link and its tags from an imgspark page.

    Args:
        url: Site-relative path, appended to ``"http://www.imgspark.com"``.

    Returns:
        A three-item list ``[image_src, source_href, tags]``:
        ``image_src`` is the ``src`` of the ``lrg_image`` element,
        ``source_href`` is the first link inside ``source_content`` (the
        string ``'None'`` when the page has none), and ``tags`` is the list
        of ``title`` attributes inside the ``list_tags_horizontal`` list
        (empty when the page has no such list).

    Raises:
        IndexError: if the page has no ``lrg_image`` element.
    """
    # NOTE(review): the result is passed straight to re.findall, so
    # Gzzip.newOpen presumably returns the page text here - confirm.
    r = Gzzip.newOpen("http://www.imgspark.com" + url)
    flags = re.MULTILINE | re.S

    # Image URL: required, so a missing match is allowed to raise.
    images = re.findall(r'id="lrg_image" src="(.*?)"', r, flags)
    src = [images[0]]

    # Source location: optional, recorded as the string 'None' when absent.
    locations = re.findall(r'<span id="source_content"><a href="(.*?)"', r, flags)
    src.append(locations[0] if locations else 'None')

    # Tags: optional as well; a page without a tag list yields no tags
    # instead of failing after the rest was already parsed.
    tag_lists = re.findall(r'<ul class="list_tags_horizontal">(.*?)</ul>', r, flags)
    tags = re.findall(r'title="(.*?)"', tag_lists[0], flags) if tag_lists else []
    src.append(tags)
    return src
Example #3
0
File: spider.py Project: h2ero/note
def listhref(id):
    """Return the image links found on one page of the all-time popular list.

    Args:
        id: Value converted with ``str()`` and used as the last path
            segment of the listing URL.

    Returns:
        The list of ``href`` values captured from each ``image_wrap`` block.
    """
    page_url = "".join(
        ["http://www.imgspark.com/image/popular/all/alltime/", str(id), "/"]
    )
    listing = Gzzip.newOpen(page_url)
    link_re = re.compile(
        r'<div class="image_wrap">\s+<a href="(.*?)"(?:.*?)\s+</div>',
        re.MULTILINE | re.S,
    )
    return link_re.findall(listing)