def getsoup(request):
    """Render the scraping form and, on a valid POST, scrape the given URL.

    * Non-POST requests get an unbound form pre-filled with a sample JD
      listing URL.
    * An invalid POST re-renders the bound form.
    * A valid POST makes sure today's image directory exists, calls
      ``getResource.grab_360buy_bag_m`` on the submitted URL and renders the
      page with ``ans`` set to that directory path.

    NOTE(review): the source this block was taken from contains a second
    ``getsoup`` definition further down; if both live in the same module the
    later one replaces this one at import time.
    """
    if request.method != 'POST':
        form = SoupForm(
            initial={'url': 'http://list.jd.com/1672-2576-5262.html'})
        return render_to_response('beautiful_soup.html', {'form': form})

    form = SoupForm(request.POST)
    if not form.is_valid():
        return render_to_response('beautiful_soup.html', {'form': form})

    url = form.cleaned_data['url']
    print(url)

    # Only the 360buy (JD) scraper is wired up.  The former 'taobao' branch
    # could never run (the site name was hard-coded) and referenced names
    # that were no longer defined, so it has been dropped.
    print(settings.MEDIA_ROOT)
    # Plain concatenation is kept so the path handed to the template is
    # unchanged; this assumes MEDIA_ROOT ends with a path separator.
    target_dir = settings.MEDIA_ROOT + 'jd360/'
    img_root = target_dir + time.strftime('%Y%m%d')
    if not os.path.exists(img_root):
        # makedirs also creates 'jd360/' itself on the first run, where
        # os.mkdir would raise because the parent directory is missing.
        os.makedirs(img_root)

    # NOTE(review): the grab_360buy_bag_m implementation seen next to this
    # view saves into os.path.dirname() of its second argument, which here is
    # the parent 'jd360/' directory rather than the dated one -- confirm that
    # this is intended.
    getResource.grab_360buy_bag_m(url, img_root)
    print('img_root-----------')
    print(img_root)
    return render_to_response('beautiful_soup.html',
                              {'form': form, 'ans': img_root})
def grab_360buy_saveToModel(url, id_cate, id_s, localfile):
    """Scrape a 360buy listing page, saving images, DB rows and a text log.

    The page at ``url`` is fetched with a browser-like User-Agent and parsed
    as gb18030.  For every ``<li>`` inside the ``div#plist`` element the
    product image is downloaded next to ``localfile`` (named ``<n>.jpg``
    after the 1-based item position), a ``Commidity`` and a ``Picture`` row
    are saved, and one ``---``-separated line is appended to ``localfile``.

    Args:
        url: address of the listing page.
        id_cate: primary key of the ``models.Category`` assigned to each item.
        id_s: primary key of the ``models.Seller`` assigned to each item.
        localfile: path of the text log to (over)write; its directory is also
            where the images are stored.

    Returns:
        Always ``True``.
    """
    request = urllib2.Request(
        url=url,
        headers={'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; '
                               'rv:1.9.2.3) Gecko/20100401 Firefox/3.6.3'})
    response = urllib2.urlopen(request)
    try:
        html_response = response.read()
    finally:
        # Release the connection even when reading fails.
        response.close()

    soup = BeautifulSoup(html_response, from_encoding="gb18030")
    if not soup:
        return True
    tag_div = soup.find_all('div', id='plist')
    if not tag_div:
        return True
    tag_item_li = tag_div[0].find_all('li')

    # Category and seller are shared by every item on the page.
    m_cate = models.Category.objects.get(id=id_cate)
    m_s = models.Seller.objects.get(id=id_s)
    print(m_cate)
    print(type(m_s))

    path_dir = os.path.dirname(localfile)
    # 'with' guarantees the log is closed even if an item blows up mid-loop.
    with open(localfile, 'w') as myfile:
        for i, li in enumerate(tag_item_li, 1):
            div = li.find_all('div')
            # Indexes 0, 4 and 5 are read below, so anything shorter than six
            # entries (including an empty list) cannot be a product item.
            if len(div) < 6:
                print('it is empty of div.... f**k')
                continue
            print(str(i) + '........')
            p_img = div[0]
            p_name = div[4]
            p_price = div[5]

            # Save the image.
            url_item = p_img.a['href']
            url_img = p_img.img['data-lazyload']
            path_img = os.path.join(path_dir, str(i) + '.jpg')
            saveImg.saveImg(url_img, path_img)

            # Save to the model; the price is stored as a 0.0 placeholder.
            item_name = str(p_name.a.contents)
            m_com = models.Commidity(url=url_item, price=0.0, name=item_name)
            m_com.categories = m_cate
            m_com.seller = m_s
            m_com.save()
            print(m_com)
            m_p = models.Picture(dir=path_dir, commidity=m_com.id)
            m_p.save()
            print(m_p)

            # Log line: image path, name contents, price-image URL.
            myfile.write(str(path_img) + '---')
            myfile.write(item_name + '---')
            myfile.write(str(p_price.img['data-lazyload']) + '---')
            myfile.write('\r\n')
    return True
def grab_360buy(url, localfile):
    """Download the product images of a 360buy listing page.

    Every ``<li>`` inside the page's ``div#plist`` element is inspected; when
    its first ``<div>`` contains an ``<img>``, the image's ``src`` is saved
    as ``<n>.jpg`` (``n`` being the 1-based position of the ``<li>``) in the
    directory part of ``localfile``.

    Args:
        url: address of the listing page, passed to ``makeSoup``.
        localfile: only ``os.path.dirname(localfile)`` is used, as the
            directory the images are written to.

    Returns:
        Always ``True``.
    """
    soup = makeSoup(url)
    if not soup:
        return True
    tag_div = soup.find_all('div', id='plist')
    if not tag_div:
        return True

    path_dir = os.path.dirname(localfile)
    for i, li in enumerate(tag_div[0].find_all('li'), 1):
        div = li.find_all('div')
        if not div:
            print('it is empty of div.... f**k')
            continue

        # Only the first <div> (the image holder) is needed.  The name and
        # price <div>s used to be indexed here without ever being used, which
        # raised IndexError on items with fewer than four <div>s.
        p_img = div[0]
        # debug output
        print('-------------------------------')
        print(p_img)
        if p_img.img:
            url_img = p_img.img['src']
            path_img = os.path.join(path_dir, str(i) + '.jpg')
            saveImg.saveImg(url_img, path_img)
    return True
def grab_360buy_bag_m(url, localfile):
    """Scrape product links of a 360buy listing page and record each image.

    Every ``<a target="_blank">`` inside the page's ``div#plist`` element
    that wraps an ``<img>`` is considered.  The image is saved as
    ``<n>.jpg`` (``n`` counts the saved images from 1) in the directory part
    of ``localfile`` and registered through ``saveDB.saveToImagedata``.

    Args:
        url: address of the listing page, passed to ``makeSoup``.
        localfile: only ``os.path.dirname(localfile)`` is used, as the
            directory the images are written to.

    Returns:
        Always ``True``.
    """
    soup = makeSoup(url)
    if not soup:
        return True
    tag_div = soup.find_all('div', id='plist')
    if not tag_div:
        print("It's empty!!!!! fuque.......")
        return True

    tag_div_a = tag_div[0].find_all('a', target='_blank')
    path_dir = os.path.dirname(localfile)
    print(len(tag_div_a))

    i = 0
    for a in tag_div_a:
        img = a.find('img')
        if not img:
            continue
        print('>>>>>>>>>>>>>>>>>>>>' + str(i))
        url_item = a['href']
        url_img = img.get('src')
        url_img2 = img.get('src2')
        desc = img['alt']

        # Anchors carrying both attributes are skipped, as before.
        # NOTE(review): presumably 'src2' is a lazy-load placeholder and an
        # image with both set is not a real thumbnail -- confirm.
        if url_img and url_img2:
            continue
        url_img = url_img or url_img2
        if not url_img:
            # Neither attribute is present: there is nothing to download, and
            # passing None on to saveImg would only fail there.
            continue

        i += 1
        path_img = os.path.join(path_dir, str(i) + '.jpg')
        saveImg.saveImg(url_img, path_img)
        # Save to db.
        # NOTE(review): an older comment here listed the signature as
        # saveToImagedata(cid, comid, price, desc, url, localfile, gender),
        # i.e. seven parameters, while six arguments are passed -- verify
        # against the current saveDB implementation.
        saveDB.saveToImagedata(1, 0, desc, url_item, path_img, '1')
    return True
def getsoup(request):
    """Render the scraping form and, on a valid POST, scrape the given URL.

    * Non-POST requests get an unbound form pre-filled with a sample JD
      listing URL.
    * An invalid POST re-renders the bound form.
    * A valid POST makes sure today's image directory exists, calls
      ``getResource.grab_360buy_bag_m`` on the submitted URL and renders the
      page with ``ans`` set to that directory path.

    NOTE(review): the source this block was taken from contains an earlier
    ``getsoup`` definition as well; if both live in the same module this
    later one is the definition that takes effect.
    """
    if request.method != 'POST':
        form = SoupForm(
            initial={'url': 'http://list.jd.com/1672-2576-5262.html'})
        return render_to_response('beautiful_soup.html', {'form': form})

    form = SoupForm(request.POST)
    if not form.is_valid():
        return render_to_response('beautiful_soup.html', {'form': form})

    url = form.cleaned_data['url']
    print(url)

    # Only the 360buy (JD) scraper is wired up.  The former 'taobao' branch
    # could never run (the site name was hard-coded) and referenced names
    # that were no longer defined, so it has been dropped.
    print(settings.MEDIA_ROOT)
    # Plain concatenation is kept so the path handed to the template is
    # unchanged; this assumes MEDIA_ROOT ends with a path separator.
    target_dir = settings.MEDIA_ROOT + 'jd360/'
    img_root = target_dir + time.strftime('%Y%m%d')
    if not os.path.exists(img_root):
        # makedirs also creates 'jd360/' itself on the first run, where
        # os.mkdir would raise because the parent directory is missing.
        os.makedirs(img_root)

    # NOTE(review): the grab_360buy_bag_m implementation seen next to this
    # view saves into os.path.dirname() of its second argument, which here is
    # the parent 'jd360/' directory rather than the dated one -- confirm that
    # this is intended.
    getResource.grab_360buy_bag_m(url, img_root)
    print('img_root-----------')
    print(img_root)
    return render_to_response('beautiful_soup.html', {
        'form': form,
        'ans': img_root
    })