Example #1
0
import tools, time
from PIL import Image
import re
from urllib import urlretrieve


def get_gogh_urls(base_url="http://www.vangoghgallery.com/catalog/Painting/"):
    """Return the relative catalog URLs of all Van Gogh painting pages.

    Scrapes the catalog index page and extracts every link of the form
    ``catalog/Painting/....html`` from the raw HTML.

    Parameters:
        base_url: index page to scrape; defaults to the painting catalog
            (parameterized so other catalog sections can be scraped too).

    Returns:
        List of relative URL strings, possibly containing duplicates —
        callers de-duplicate with ``tools.uniqueify``.
    """
    html = tools.my_url_open(base_url).read()
    # Escape the dot so only real ".html" suffixes match; the original
    # unescaped pattern would stop at any bare "html" substring.
    return re.findall(r'catalog/Painting/.*?\.html', html)


lst = tools.uniqueify(get_gogh_urls())
lst2 = [i[17:] for i in lst]
ref_urls = [
    "http://www.vangoghgallery.com/catalog/Painting/" + i for i in lst2
]
img_urls = []
for i in ref_urls:
    html = tools.my_url_open(i).read()
    s = re.findall("image/.*?/.*?.jpg", html)
    assert len(s) == 2 and s[0] == s[1]
    print i
    img_urls.append("http://www.vangoghgallery.com/catalog/" + s[0])

#lst2 = [i[6:-1] for i in lst]
#lst4 = []
#for i in lst2:
#    html = tools.my_url_open("http://flower-dictionary.com/"+i).read()
Example #2
0
                print "YO7"
                continue
            res['att_price'] = float(nxt[aaa:bbb].replace(",",""))
            results.append(res)
        except:
            pass
    return results


for p in range(326,328): 
    allresults = []
    url = urlstring.replace("pG",str(p))
    #print url
    html = tools.my_url_open(url).read()
    allresults=scrape_amazon_page(html)
    print "results page",p, len(allresults), len(allresults)/24., len(tools.uniqueify(allresults))
  
    i=0
    ar2 = []
    for r in allresults:
        i=i+1
        imfname = "c:/temp/tiles2/"+r['ASIN']+".jpg"
        if not os.path.exists(imfname):
            try:
                print "Scraping object",i,"page",p,(p-1)*24+i
                html = tools.my_url_open(r['url']).read()
                r['att_glass']=1 if 'Glass Tiles</a>' in html else 0
                r['att_ceramic']=1 if 'Ceramic Tiles</a>' in html else 0
                r['att_stone']=1 if 'Stone Tiles</a>' in html else 0
                r['att_limestone']=1 if 'Limestone Tiles</a>' in html else 0
                r['att_marble']=1 if 'Marble Tiles</a>' in html else 0
Example #3
0
import tools, time
from PIL import Image
import re
from urllib import urlretrieve

def get_gogh_urls():
    """Scrape the Van Gogh catalog index and return every painting link.

    Returns a (possibly duplicated) list of relative URLs of the form
    ``catalog/Painting/....html`` found in the index page's HTML.
    """
    index_url = "http://www.vangoghgallery.com/catalog/Painting/"
    page = tools.my_url_open(index_url).read()
    return re.findall('catalog/Painting/.*?html', page)
    
# De-duplicate the scraped relative links from the catalog index.
lst=tools.uniqueify(get_gogh_urls())
# Strip the leading "catalog/Painting/" prefix (17 characters) from each link.
lst2=[i[17:] for i in lst]
# Rebuild absolute URLs for every individual painting page.
ref_urls = ["http://www.vangoghgallery.com/catalog/Painting/"+i for i in lst2]
img_urls = []
for i in ref_urls:
    html = tools.my_url_open(i).read()
    # Each painting page embeds its image path twice; collect both occurrences.
    s = re.findall("image/.*?/.*?.jpg",html)
    # NOTE(review): assert is stripped under -O; both matches must agree.
    assert len(s)==2 and s[0]==s[1]
    print i
    img_urls.append("http://www.vangoghgallery.com/catalog/"+s[0])



#lst2 = [i[6:-1] for i in lst]
#lst4 = []
#for i in lst2:
#    html = tools.my_url_open("http://flower-dictionary.com/"+i).read()
#    lst3=re.findall("/uploads/flowers/.*?.jpg",html)    
#    if len(lst3)!=1:
Example #4
0
                continue
            res['att_price'] = float(nxt[aaa:bbb].replace(",", ""))
            results.append(res)
        except:
            pass
    return results


for p in range(326, 328):
    allresults = []
    url = urlstring.replace("pG", str(p))
    #print url
    html = tools.my_url_open(url).read()
    allresults = scrape_amazon_page(html)
    print "results page", p, len(allresults), len(allresults) / 24., len(
        tools.uniqueify(allresults))

    i = 0
    ar2 = []
    for r in allresults:
        i = i + 1
        imfname = "c:/temp/tiles2/" + r['ASIN'] + ".jpg"
        if not os.path.exists(imfname):
            try:
                print "Scraping object", i, "page", p, (p - 1) * 24 + i
                html = tools.my_url_open(r['url']).read()
                r['att_glass'] = 1 if 'Glass Tiles</a>' in html else 0
                r['att_ceramic'] = 1 if 'Ceramic Tiles</a>' in html else 0
                r['att_stone'] = 1 if 'Stone Tiles</a>' in html else 0
                r['att_limestone'] = 1 if 'Limestone Tiles</a>' in html else 0
                r['att_marble'] = 1 if 'Marble Tiles</a>' in html else 0