import tools, time
from PIL import Image
import re
from urllib import urlretrieve

def get_gogh_urls():
    # Collect the relative links to every painting page in the catalog.
    urlstring = "http://www.vangoghgallery.com/catalog/Painting/"
    html = tools.my_url_open(urlstring).read()
    lst = re.findall('catalog/Painting/.*?html', html)
    return lst

lst = tools.uniqueify(get_gogh_urls())
lst2 = [i[17:] for i in lst]  # strip the "catalog/Painting/" prefix
ref_urls = ["http://www.vangoghgallery.com/catalog/Painting/" + i for i in lst2]

# Visit each painting page and pull out its image URL.
img_urls = []
for i in ref_urls:
    html = tools.my_url_open(i).read()
    s = re.findall("image/.*?/.*?.jpg", html)
    assert len(s) == 2 and s[0] == s[1]  # each page should list the same image twice
    print i
    img_urls.append("http://www.vangoghgallery.com/catalog/" + s[0])

# Commented-out remnant of an earlier flower-dictionary.com scraper:
#lst2 = [i[6:-1] for i in lst]
#lst4 = []
#for i in lst2:
#    html = tools.my_url_open("http://flower-dictionary.com/" + i).read()
#    lst3 = re.findall("/uploads/flowers/.*?.jpg", html)
#    if len(lst3) != 1:
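# Possible next step (sketch, not part of the original script): the unused
# urlretrieve / PIL / time imports above suggest the collected img_urls were
# meant to be downloaded and verified.  The output directory is an assumption
# and must already exist.
outdir = "c:/temp/gogh/"                     # hypothetical local directory
for n, u in enumerate(img_urls):
    fname = outdir + u.split("/")[-1]        # keep the original file name
    urlretrieve(u, fname)                    # download the painting image
    print n, fname, Image.open(fname).size   # confirm the file opens as an image
    time.sleep(1)                            # throttle requests to the server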
# Fragment: the tail of scrape_amazon_page(), which parses one Amazon results
# page into a list of product dicts.  The opening of the function (and the
# definitions of urlstring, nxt, aaa, bbb) is missing from this fragment.
                print "YO7"   # price field not found in this listing; skip it
                continue
            res['att_price'] = float(nxt[aaa:bbb].replace(",", ""))
            results.append(res)
    except:
        pass
    return results

# Walk the results pages (urlstring contains a "pG" page-number placeholder)
# and scrape each product's detail page for material attributes.
import os   # needed below for the cached-image check

for p in range(326, 328):
    allresults = []
    url = urlstring.replace("pG", str(p))
    #print url
    html = tools.my_url_open(url).read()
    allresults = scrape_amazon_page(html)
    # Each results page is expected to carry 24 products.
    print "results page", p, len(allresults), len(allresults) / 24., len(tools.uniqueify(allresults))
    i = 0
    ar2 = []
    for r in allresults:
        i = i + 1
        imfname = "c:/temp/tiles2/" + r['ASIN'] + ".jpg"
        if not os.path.exists(imfname):
            try:
                print "Scraping object", i, "page", p, (p - 1) * 24 + i
                html = tools.my_url_open(r['url']).read()
                # Flag which material categories are linked from the product page.
                r['att_glass'] = 1 if 'Glass Tiles</a>' in html else 0
                r['att_ceramic'] = 1 if 'Ceramic Tiles</a>' in html else 0
                r['att_stone'] = 1 if 'Stone Tiles</a>' in html else 0
                r['att_limestone'] = 1 if 'Limestone Tiles</a>' in html else 0
                r['att_marble'] = 1 if 'Marble Tiles</a>' in html else 0
                # (fragment truncated here)
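# Possible continuation (sketch, not in the original fragment): imfname and
# the otherwise-unused ar2 list above suggest each product's image was
# downloaded and the enriched record kept.  img_url is hypothetical -- the
# real script would have extracted it from the product page html.
#                img_url = ...                   # hypothetical: image URL found in html
#                urlretrieve(img_url, imfname)   # urllib.urlretrieve; cache the tile image
#                ar2.append(r)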