def grab_360buy_saveToModel(url, id_cate, id_s, localfile):
    """Scrape a 360buy listing page, save every item to the models and log it.

    The page at ``url`` is fetched with a browser-like User-Agent and parsed
    as gb18030.  Every ``<li>`` inside the first ``<div id="plist">`` that
    contains ``<div>`` children is treated as one item:

    * the product image and the price image are downloaded through
      ``saveImg.saveImg`` into an ``img`` directory next to ``localfile``,
      named ``<n>.jpg`` and ``<n>_price.jpg`` (``n`` is the 1-based position
      of the ``<li>``);
    * a ``models.Commidity`` and a ``models.Picture`` row are saved;
    * one ``---``-separated line is appended to ``localfile``.

    Args:
        url: Address of the listing page.
        id_cate: Primary key of the ``models.Category`` assigned to each item.
        id_s: Primary key of the ``models.Seller`` assigned to each item.
        localfile: Path of the text file to (over)write with the item log.

    Returns:
        Always ``True``, including when the page has no ``plist`` div.
    """
    request = urllib2.Request(
        url=url,
        headers={'User-Agent' : 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.2.3) Gecko/20100401 Firefox/3.6.3' })
    response = urllib2.urlopen(request)
    try:
        html_response = response.read()
    finally:
        # Release the connection even when reading fails.
        response.close()

    soup = BeautifulSoup(html_response, from_encoding="gb18030")
    if not soup:
        return True
    tag_div = soup.find_all('div', id='plist')
    if not tag_div:
        return True
    tag_item_li = tag_div[0].find_all('li')

    # Category and seller shared by every item scraped from this page.
    m_cate = models.Category.objects.get(id=id_cate)
    m_s = models.Seller.objects.get(id=id_s)
    print(m_cate)
    print(type(m_s))

    path_dir = os.path.join(os.path.dirname(localfile), 'img')

    # The context manager closes the log file even if a download, a tag
    # lookup or a database save raises part-way through the page.
    with open(localfile, 'w') as myfile:
        for i, li in enumerate(tag_item_li, 1):
            div = li.find_all('div')
            if not div:
                print('it is empty of div.... f**k')
                continue

            print(str(i)+'........')
            # Positions of the image, name and price containers in the
            # item's list of <div> descendants.
            p_img = div[0]
            p_name = div[4]
            p_price = div[5]

            # Product image.
            url_item = p_img.a['href']
            url_img = p_img.img['data-lazyload']
            path_img = os.path.join(path_dir, str(i)+'.jpg')
            saveImg.saveImg(url_img, path_img)

            # The price is taken from an <img> tag, so it is stored as a
            # picture too.
            url_price = p_price.img['data-lazyload']
            path_price = os.path.join(path_dir, str(i)+'_price.jpg')
            saveImg.saveImg(url_price, path_price)

            # Persist the item; the name is the string form of the link's
            # ``contents`` list, exactly as it is written to the log below.
            item_name = str(p_name.a.contents)
            m_com = models.Commidity(url=url_item, price=0.0, name=item_name)
            m_com.categories = m_cate
            m_com.seller = m_s
            m_com.save()
            print(m_com)
            m_p = models.Picture(dir=path_dir, commidity=m_com.id)
            m_p.save()
            print(m_p)

            # One log line per item.
            myfile.write(str(path_img) + '---'
                         + item_name + '---'
                         + str(url_price) + '---'
                         + '\r\n')
    return True
def grab_360buy(url, localfile):
    """Scrape a 360buy listing page, download item images and log the items.

    The page is obtained through ``makeSoup(url)``.  Every ``<li>`` inside the
    first ``<div id="plist">`` that contains ``<div>`` children is treated as
    one item: its product image and price image are downloaded through
    ``saveImg.saveImg`` into an ``img`` directory next to ``localfile`` (file
    names are built from ``getRandom.getRandomStr(5)``), and one
    ``---``-separated line is written to ``localfile``.

    Args:
        url: Address of the listing page.
        localfile: Path of the text file to (over)write with the item log.

    Returns:
        Always ``True``, including when the page has no ``plist`` div.
    """
    print(localfile)
    soup = makeSoup(url)
    if not soup:
        return True
    tag_div = soup.find_all('div', id='plist')
    if not tag_div:
        return True
    tag_item_li = tag_div[0].find_all('li')

    path_dir = os.path.join(os.path.dirname(localfile), 'img')

    # The context manager closes the log file even if a download or a tag
    # lookup raises part-way through the page.
    with open(localfile, 'w') as myfile:
        for i, li in enumerate(tag_item_li, 1):
            div = li.find_all('div')
            if not div:
                print('it is empty of div.... f**k')
                continue

            print(str(i)+'........')
            # Positions of the image, name and price containers in the
            # item's list of <div> descendants.
            p_img = div[0]
            p_name = div[2]
            p_price = div[3]
            print(p_img)
            print('-----------------')
            print(p_name)
            print('-----------------')
            print(p_price)

            # Product image.
            url_img = p_img.img['data-lazyload']
            img_name = getRandom.getRandomStr(5)
            path_img = os.path.join(path_dir, img_name + '.jpg')
            saveImg.saveImg(url_img, path_img)

            # NOTE(review): p_img has already been dereferenced above, so
            # this check cannot protect the image download; it is kept where
            # the original had it. Confirm that the log line is meant to be
            # written only when the check passes.
            if p_price and p_name and p_img:
                # The price is taken from an <img> tag, so it is stored as
                # a picture too.
                url_price = p_price.img['data-lazyload']
                path_price = os.path.join(path_dir, img_name + '_price.jpg')
                saveImg.saveImg(url_price, path_price)

                # One log line per item.
                myfile.write(str(path_img) + '---'
                             + str(p_name.a.contents) + '---'
                             + str(url_price) + '---'
                             + '\r\n')
    return True
def getsoup(request):
    """Django view: show the scrape form and run a scrape when it is posted.

    On a non-POST request, or when the posted form is invalid, the
    ``beautiful_soup.html`` template is rendered with the form only.  On a
    valid POST the submitted ``url`` is scraped according to ``website``:

    * ``'taobao'``: ``getResource.grabHref`` is run, then the JSON returned
      by ``taobao_lib.get_json`` is parsed and the image of every entry in
      its ``itemList`` is downloaded with ``saveImg.saveImg``;
    * ``'360buy'``: ``getResource.grab_360buy`` is run.

    Output files live in the ``imgdb`` directory beside this module's parent
    directory and carry an id from ``getRandom.getUUID()`` in their names;
    that id is passed to the template as ``ans``.

    Args:
        request: The incoming Django request.

    Returns:
        The response produced by ``render_to_response``.
    """
    if request.method != 'POST':
        form = SoupForm(initial={'url':'http://www.baidu.com'})
        return render_to_response('beautiful_soup.html', {'form': form})

    form = SoupForm(request.POST)
    if not form.is_valid():
        # Re-render the bound form.
        return render_to_response('beautiful_soup.html', {'form': form})

    clean_data = form.cleaned_data
    url = clean_data['url']
    website = clean_data['website']

    rd = getRandom.getUUID()
    # Build the paths from components instead of embedding '\\' so they are
    # also valid on platforms whose separator is not a backslash.
    imgdb_dir = os.path.join(os.path.dirname(__file__), '..', 'imgdb')
    path_img = os.path.join(imgdb_dir, 'taobao_' + rd + '.jpg')
    localfile = os.path.join(imgdb_dir, 'url_' + rd + '.txt')

    if website == 'taobao':
        getResource.grabHref(url, localfile)
        json_data = json.loads(taobao_lib.get_json(url))
        for item in json_data['itemList']:
            # NOTE(review): every item is saved to the same path_img, so each
            # download overwrites the previous one - confirm this is intended.
            saveImg.saveImg(item['image'], path_img)
    elif website == '360buy':
        getResource.grab_360buy(url, localfile)

    return render_to_response('beautiful_soup.html', {'form': form, 'ans': rd})