def get_list(url):
    """Return the total result count reported on a search-listing page.

    Fetches *url*, locates the result-summary container (the site emits
    the class attribute both quoted and unquoted) and extracts the number
    from "of N results".  Returns 0 when the container or count is absent.
    """
    html = get_html.get_html(url)
    # Previously the same extraction logic was duplicated for each of the
    # two container markups; iterate over both patterns instead.
    patterns = (
        r"<div class='result-summary-container'>(.*?)</div>",
        r"<div class=result-summary-container>(.*?)</div>",
    )
    for pattern in patterns:
        container = re.search(pattern, html, re.S)
        if container is None:
            continue
        count = re.search(r'of[\s]([0-9]?\,?[0-9]+)[\s]results',
                          container.group(1), re.S)
        if count:
            return int(count.group(1).replace(',', ''))
        return 0
    return 0
def handle(list_url):
    """Crawl every paginated listing page for one category line.

    *list_url* is "<result_count>\\t<url>\\n"; each page's
    ``"displayOrder"`` item ids are appended to the shared ``result_file``.
    """
    try:
        url_info = list_url.split("\t")
        url = url_info[-1].replace("\n", "")
        list_num = int(url_info[0])
        # e.g. http://www.walmart.com/browse/books/...?page=2&cat_id=...
        items_url_info = url.split("?")
        # The site exposes at most 1000 results.
        if list_num > 1000:
            list_num = 1000
        # BUG FIX: "/" yields a float in Python 3, which made
        # range(1, page_num + 1) raise TypeError; use ceiling integer
        # division (40 items per page).
        page_num = (list_num + 39) // 40
        for page in range(1, page_num + 1):
            items_url = items_url_info[0] + '?page=' + str(
                page) + '&' + items_url_info[1]
            html = get_html.get_html(items_url)
            itemsId_info = re.search(r'"displayOrder":\[(.*?)\]', html, re.S)
            itemsId_all = itemsId_info.group(1).replace('"', '')
            for itemsId in itemsId_all.split(','):
                result_file.write(itemsId + '\n')
                result_file.flush()
                print(itemsId)
    except Exception as e:
        print(e)
def get_all_url(page_num, base_url):
    """Fetch pages 1..page_num of base_url+<i> and save each page's HTML
    to ./html/<i>; report a failure message when a fetch returns nothing."""
    for i in range(1, page_num + 1):
        url = base_url + str(i)
        print(url)
        html = get_html.get_html(url)
        if html:
            # BUG FIX: the file handle was never closed (leak); "with"
            # closes it deterministically.
            with open('./html/' + str(i), 'w') as ff:
                ff.write(html)
            print('write html succeed')
        else:
            print('write html error')
def get_json(info):
    """Scrape the sale-record statistics for one offer page via selenium.

    *info* is "<basic_info>http<offer-url>"; everything before the first
    "http" is carried through unchanged.  Writes one tab-separated line to
    the shared ``result_file`` under ``lock``.
    """
    driver.implicitly_wait(8)
    try:
        # Split the input line into its prefix columns and the offer URL.
        info_url = 'http' + info.split('http')[1]
        basic_info = info.split('http')[0]
        driver.get(info_url)
        html = driver.page_source
        # The deal-record widget embeds the URL of the remark-list JSON feed.
        formor_url_list = re.findall(
            r'<div id="mod-detail-dealrecord"(.*?)"isTgcSKUOffer"', html, re.S)
        formor = str(formor_url_list).split('"remarkListUrl":"')[1]
        formor_url = formor.split('","')[0]
        print(formor_url)
        memberId_list = re.findall(
            r'<input type="hidden" id="feedbackUid"(.*?)/>', html, re.S)
        memberId = str(memberId_list).split('value="')[1]
        memberId = memberId.split('"')[0]
        memberId_list_1 = re.findall(r'var WolfSmoke={(.*?)}', html, re.S)
        memberId_1 = str(memberId_list_1).split('member_id:"')[1]
        memberId_1 = memberId_1.split('"')[0]
        print(memberId_1)
        # BUG FIX: the query string had been corrupted by HTML-entity
        # decoding ('&curren' collapsed into the currency sign '¤');
        # restore the intended '&currentPage=1' parameter.
        json_url = formor_url + '&currentPage=1&memberId=' + memberId
        print(json_url)
        data_prime = get_html.get_html(json_url)
        print(data_prime)
        finally_file = []
        # Cut the offerSaleRecordStat object out of the response, e.g.
        # {"repeatBuyCount":6.5,"buyerTotal":123,...,"oneRecordRate":6.73}
        offerSaleRecordStat = data_prime.split('offerSaleRecordStat":')[1]
        offerSaleRecordStat = offerSaleRecordStat.split(',"currentPage"')[0]
        numlist = re.findall(r'":(.*?),"', str(offerSaleRecordStat), re.S)
        print(numlist)
        finally_file.append(basic_info)
        # BUG FIX: the list itself was appended, so "\t".join() raised
        # TypeError (silently swallowed by the old bare except); extend
        # with the individual number strings instead.
        finally_file.extend(numlist)
        lock.acquire()
        result_file.write("\t".join(finally_file) + "\n")
        result_file.flush()
        lock.release()
        driver.implicitly_wait(5)
        driver.close()
    except Exception as e:
        # Was a bare "except: pass", which also swallowed KeyboardInterrupt
        # and hid every scraping failure; at least log the error.
        print(e)
def get_items_html(items_file):
    """Read item URLs from *items_file* (one per line), fetch each page and
    pass its HTML to get_info(); a header row is appended to the shared
    output spreadsheet first."""
    global lock, pool, items_info
    title_list = ['itemsId', 'price', 'ship', 'stock', 'brand', 'title',
                  'img1', 'img2', 'img3', 'detail1', 'detail2', 'detail3',
                  'detail4', 'detail5', 'Specification']
    file_name = './Hidden Cameras.xls'
    # BUG FIX: 'aw' is not a valid open() mode in Python 3 (ValueError);
    # 'a' appends and creates the file when missing.
    items_info = open(file_name, 'a')
    items_info.write('\t'.join(title_list) + '\n')
    items_info.flush()
    # Close the input file instead of leaking the handle.
    with open(items_file, 'r') as src:
        items_list = src.readlines()
    for it in items_list:
        print(it)
        # NOTE(review): "it" still carries its trailing newline --
        # presumably get_html tolerates it; confirm.
        html = get_html.get_html(it)
        get_info(html)
    items_info.close()
def handle(items):
    """Fetch the Amazon product page for ASIN *items*; print the HTML on
    success, log failures to the error file."""
    # BUG FIX: domain typo 'amazom.com' -> 'amazon.com'.
    url = 'https://www.amazon.com/dp/' + items
    print(url)
    try:
        # Fetch the product-detail page.
        html = get_html.get_html(url)
        if html:
            print(html)
        else:
            print('error')
    except Exception as e:
        with open('./Result/asin_is_exist/error.txt', 'a') as fail_url:
            # BUG FIX: "items + e" concatenated str with an Exception and
            # raised TypeError inside the handler; stringify the error.
            fail_url.write(items + str(e) + '\n')
def get_bookurl_by_cateurl(url):
    """Collect every book URL from a half.ebay.com category listing.

    Reads page 1 to learn the page count, then walks pages 2..pages-1,
    retrying each page until its page marker matches (or the site says
    "Try again...").  Book URLs go to the shared ``result_file``; pages
    blocked by "Security Measure" go to ``not_crawl_file``.
    """
    try:
        html = get_html.get_html(url)
        if html and -1 == html.find("Security Measure"):
            pages = re.findall('<b>Page 1</b> of (.*?)<br>', html)
            if pages:
                pages = int(pages[0].replace(",", ""))
                book_urls = re.findall(
                    '<div style="float:left;" itemscope="itemscope" itemtype="http://schema.org/SearchResultsPage"><b><a href="(.*?)">',
                    html, re.S)
                for book_url in book_urls:
                    lock.acquire()
                    result_file.write(book_url + '\n')
                    result_file.flush()
                    lock.release()
                # Pagination suffix: page N is <url>QQpgZ<N>.
                url = url + 'QQpgZ'
                for page in range(2, pages):
                    bookurl = url + str(page)
                    print(bookurl)
                    while True:
                        # BUG FIX: was "get_html(bookurl)" -- calling the
                        # module object itself (TypeError); every other call
                        # site uses get_html.get_html().
                        html = get_html.get_html(bookurl)
                        if html:
                            real_page = re.findall('<b>Page (.*?)</b> of', html)
                            if real_page:
                                # Only accept the page once the server really
                                # served the requested page number.
                                if str(page) == real_page[0]:
                                    book_urls = re.findall(
                                        '<div style="float:left;" itemscope="itemscope" itemtype="http://schema.org/SearchResultsPage"><b><a href="(.*?)">',
                                        html, re.S)
                                    for book_url in book_urls:
                                        lock.acquire()
                                        result_file.write(book_url + '\n')
                                        result_file.flush()
                                        lock.release()
                                    break
                            elif -1 != html.find("Try again..."):
                                break
        else:
            lock.acquire()
            not_crawl_file.write(url + "\n")
            not_crawl_file.flush()
            lock.release()
    except Exception as e:
        print(e)
        logger2.error(str(e))
def get_newegg_url(page_num, new_url):
    """Crawl pages 1..page_num of new_url+<i> and append every
    CompareItem item number found to ./items_new.txt."""
    for i in range(1, page_num + 1):
        url = new_url + str(i)
        print(url)
        html = get_html.get_html(url)
        if html:
            # e.g. <input id="CompareItem_9SIA97A4TG0647" ...
            #      value="CompareItem_9SIA97A4TG0647"> -- capture the id part.
            items = re.findall(r'value="CompareItem_(.*?)"', html)
            print(len(items))
            # BUG FIX: 'aw' is not a valid open() mode in Python 3, and the
            # file was reopened every iteration without ever being closed.
            with open('./items_new.txt', 'a') as result_file:
                for item in items:
                    result_file.write(item + '\n')
        else:
            print('write html error')
def get_result(itemsId, i):
    """Fetch one item record from the Walmart Open API.

    itemsId -- Walmart item id
    i       -- index into the rotating API-key pool
    Returns the decoded item dict.
    """
    import json  # local import: parse the API response safely
    key_list = [
        'asfas', '', 'z2pqv4dtuwhxe3hkesx9kqpv', 'fmwnnrwf53d6c5sw7b4pu2q3',
        'jqpyjz92jmaruene4mpbe8pc', 'nz2gzu5byp9dbnm6jee69jkp',
        'sfpw74s5yte8dj9r9atzyc5m', '2tk5sghn56mnth5uabspkdt6',
        'favxs8n4kvmrtebrn6uymjdy', 'vffn6p33smtby3tytugp8zjt',
        'ctztjz3gfm273husdu7apwwh', 'fwx3gf2qdqhx782h9espqajz',
        'f7fqv4jzcdr7ccfb2b339cv9', '7sd4rpjfmdurwuwzgvpbffd2'
    ]
    key = key_list[i]
    items_url = ('http://api.walmartlabs.com/v1/items/' + str(itemsId) +
                 '?apiKey=' + key + '&format=json')
    print(items_url)
    info = get_html.get_html(items_url)
    # SECURITY FIX: the response was eval()-ed (the true/null/false shims
    # existed only to make eval parse JSON) -- remote payloads must never be
    # evaluated as Python code; json.loads handles them natively.
    items_info = json.loads(info)
    return items_info
def get_asin(base_url, page):
    """Crawl *page* frys.com listing pages (25 items per page) and append
    each product URL to ./Result/items_url.txt.

    base_url carries '[page]' and '[start]' placeholders.
    """
    for i in range(0, page):
        start_num = i * 25  # result offset for page i
        url = base_url.replace("[page]", str(i)).replace('[start]',
                                                         str(start_num))
        print(url)
        html = get_html.get_html(url)
        url_list_re = re.findall(r'<td colspan="2">(.*?)</td>', html, re.S)
        print(url_list_re)
        url_list = re.findall(r'<A HREF="(.*?)">', str(url_list_re), re.S)
        print(url_list)
        print(len(url_list))
        for goods_url in url_list:
            # BUG FIX: 'aw' is not a valid open() mode in Python 3; use 'a'.
            with open("./Result/items_url.txt", "a") as f:
                f.write('http://www.frys.com/' + goods_url + "\n")
            print(goods_url)
def handle(itemsId):
    """Fetch one Newegg product page by item id and hand it to get_info();
    fetch failures and exceptions are appended to their own log files."""
    try:
        # Trim surrounding whitespace/newline from the id.
        itemsId = itemsId.strip()
        # Product-detail page.
        url = 'http://www.newegg.com/Product/Product.aspx?Item=' + itemsId
        html = get_html.get_html(url)
        if html:
            get_info(html, itemsId)
        else:
            # BUG FIX: 'aw' is not a valid open() mode in Python 3; use 'a'.
            with open('./get_html_fail.txt', 'a') as h:
                h.write(itemsId + '\n')
    except Exception as e:
        print(itemsId, ":", e)
        with open('./except.txt', 'a') as f:
            f.write(itemsId + '\n')
def get_asin():
    """Walk 221 Amazon search pages (Monitors, Prime-eligible, new) and
    append every product-detail URL to ./Result/items_url.txt."""
    base_url = '''https://www.amazon.com/s/ref=sr_pg_[i]?fst=as%3Aoff&rh=n%3A172282%2Cn%3A!493964%2Cn%3A172541%2Cn%3A12097478011%2Cp_85%3A2470955011%2Cp_n_condition-type%3A2224371011&page=[i]&bbn=12097478011&ie=UTF8&qid=1479085629'''
    for i in range(1, 222):  # page number
        url = base_url.replace("[i]", str(i))
        print(url)
        time.sleep(2)  # throttle to avoid being blocked
        html = get_html.get_html(url)
        url_list = re.findall(
            r'<a class="a-link-normal s-access-detail-page .*? href="(.*?)">',
            html, re.S)
        print(len(url_list))
        for goods_url in url_list:
            # BUG FIX: 'aw' is not a valid open() mode in Python 3; use 'a'.
            with open("./Result/items_url.txt", "a") as f:
                f.write(goods_url + "\n")
            print(goods_url)
            # (removed the trailing unused items_asin extraction)
def get_book_html(line):
    """Download one half.ebay.com product page and save it as
    ./result/<opt[0]>/book/<book_id>.html; failures are appended to the
    fail-log files."""
    print('---------------------get_book_html begin----------------------')
    url = line.replace('\n', '')
    book_id = re.findall('http://product.half.ebay.com/.*?/(.*?)&.*?tg=info',
                         url)[0]
    html = get_html.get_html(url)
    if html:
        with open('./result/' + opt[0] + '/book/' + book_id + '.html',
                  'w') as ff:
            ff.write(html)
        print('success:', url)
    else:
        # BUG FIX: 'aw' is not a valid open() mode in Python 3; use 'a'.
        with open('./result/' + opt[0] + '/get_book_html_fail.txt',
                  'a') as fff:
            fff.write(line)
        with open('./result/' + opt[0] + '/get_book_info_fail.txt',
                  'a') as ffff:
            ffff.write(line.replace('\n', '') + '.html' + '\n')
        print('fail:', url)
    print('---------------------get_book_html end----------------------')
def handle(itemsurl):
    """Fetch one product page and pass its HTML to get_info(); fetch
    failures and exceptions are logged to their own files."""
    try:
        html = get_html.get_html(itemsurl)
        if html:
            get_info(html)
            print(html)
        else:
            # BUG FIX: 'aw' is not a valid open() mode in Python 3; use 'a'.
            with open('./Result/get_html_fail.txt', 'a') as h:
                h.write(itemsurl + '\n')
    except Exception:
        with open('./Result/fail_url.txt', 'a') as fail_url:
            fail_url.write(itemsurl + '\n')
def get_book_info(isbn):
    """Scrape one half.ebay.com search page for a book and emit listing rows.

    *isbn* is one line from the input file (may end in a newline).  For each
    sales condition found, a detail row goes to ``result_file`` and a shelf
    row to ``onshelf_file``; filtered/blocked items are recorded in
    ``delete_file`` / ``not_crawl_file`` / ``not_list_file``.  Shared files
    are guarded by the global ``lock``.
    """
    url = "http://search.half.ebay.com/" + isbn.replace(
        "\n", "") + "_W0QQmZbooksQQ_trksidZp2919Q2em1447Q2el2686"
    print(url)
    time.sleep(2.5)  # throttle requests
    html = get_html.get_html(url)
    if html and -1 == html.find("No products found for"):
        if -1 == html.find("Security Measure"):
            try:
                isbn13 = ''
                # NOTE: the parameter is re-bound here to the scraped ISBN-10.
                isbn = re.findall(
                    r'ISBN-10:</b>\s+<span class=""><a href=.*? class="pdplinks">(.*?)</a>',
                    html, re.S)
                if isbn:
                    isbn = isbn[0]
                else:
                    isbn = ''
                isbn_13 = re.findall(
                    r'ISBN-13:</b>\s+<span class=""><a href=.*? class="pdplinks">(.*?)</a>',
                    html, re.S)
                if isbn_13:
                    isbn13 = isbn_13[0]
                weight = ""
                we = re.findall(
                    r'Weight:</td><td width="80%" valign="top">(.*?)</td>',
                    html, re.S)
                # Author: try the "<a ...>" markup first; fall back to the
                # "<span>...</span>" form when the split/index fails.
                try:
                    auther_list = re.findall(
                        r'<span class="pdplinks">(.*?)</a>', html, re.S)
                    print(auther_list)
                    auther = str(auther_list).split('class="pdplinks">')[1]
                    print(auther)
                    auther = auther.split('<')[0]
                    # NOTE(review): str.replace is not in-place -- this
                    # result is discarded; likely intended auther = ...
                    auther.replace("']", '')
                except:
                    auther_list = re.findall(
                        r'<span class="pdplinks">(.*?)</span>', html, re.S)
                    auther = re.findall(r'>(.*?)</a>', str(auther_list), re.S)
                    auther = str(auther).replace("']", '')
                if we:
                    weight = we[0]
                if weight == "":
                    # Weight missing -> filter the item out.
                    delete_file.write(isbn13 + "\n")
                    delete_file.flush()
                else:
                    weight_float = float(weight.replace("oz", "").strip())
                    if weight_float > 60 or weight_float == 0:
                        # Weight over 60 oz or zero -> filter the item out.
                        delete_file.write(isbn13 + "\n")
                        delete_file.flush()
                    else:
                        # One (condition title, listing table) pair per
                        # sales condition on the page.
                        results = re.findall(
                            '<h2 class="PDP_itemConditionTitle">(.*?)</h2>.*?<table cellpadding="0" cellspacing="0" class="PDP_itemList">(.*?)</table>',
                            html, re.S)
                        if results:
                            # Condition name -> internal condition code.
                            condition = {
                                "Brand New": "11",
                                "Like New": "1",
                                "Very Good": "2",
                                "Good": "3",
                                "Acceptable": "4"
                            }
                            for i in range(len(results)):
                                book_info = []
                                shelf_info = []  # shelf/listing row
                                book_info.append(isbn)
                                book_info.append(isbn13)
                                book_info.append(weight)
                                book_info.append(auther)
                                book_info.append(results[i][0])
                                # Shelf SKU: <isbn10>_<condition code>_O_MM
                                sku = isbn + "_" + condition[results[i]
                                                             [0]] + "_O_MM"
                                shelf_info.append(sku)
                                shelf_info.append(isbn)
                                shelf_info.append("1")
                                # Seller exclusion: price cells paired with
                                # their seller names.
                                prices = re.findall(
                                    '<td><span class="PDP_itemPrice">(.*?)</span></td>.*?<td><a class="PDP_sellerName" href=".*?">(.*?)</a></td>',
                                    str(results[i]), re.S)
                                exclude_seller = [
                                    'alibris_books_01', 'alibris_books_02',
                                    'alibris_books_03', 'alibris_books_04',
                                    'alibris_books_05', 'alibris_books_06',
                                    'alibris_books_07', 'alibris_books_08',
                                    'alibris_books_09', 'alibris',
                                    'alibris_movies', 'labsbooks11'
                                ]
                                price_float = 0.0  # purchase price
                                if len(prices) == 1:
                                    # Single price under this condition.
                                    price_float = float(prices[0][0].replace(
                                        "$", "").replace(",", ""))
                                    if prices[0][1] not in exclude_seller:
                                        book_info.append(prices[0][0])
                                    else:
                                        break
                                elif len(prices) == 2:
                                    # Two prices under this condition.
                                    count = 0
                                    for j in range(2):
                                        if prices[j][1] not in exclude_seller:
                                            book_info.append(prices[j][0])
                                            # Highest qualifying price so far.
                                            orig_price = prices[j][0]
                                            count += 1
                                    if count == 0:
                                        break
                                    else:
                                        # Purchase price.
                                        price_float = float(
                                            orig_price.replace("$", "").replace(
                                                ",", ""))
                                else:
                                    count = 0
                                    for j in range(len(prices)):
                                        if prices[j][1] not in exclude_seller:
                                            book_info.append(prices[j][0])
                                            # Highest qualifying price so far.
                                            orig_price = prices[j][0]
                                            count += 1
                                            if count == 2:
                                                break
                                    if count == 0:
                                        break
                                    else:
                                        # Purchase price.
                                        price_float = float(
                                            orig_price.replace("$", "").replace(
                                                ",", ""))
                                # Shelf price: max of (cost+3.99)*1.5 and
                                # cost+23.99 (old "and/or" ternary idiom).
                                if price_float > 1:
                                    p1 = (price_float + 3.99) * 1.5
                                    p2 = price_float + 23.99
                                    price = str(p1 > p2 and p1 or p2)
                                    shelf_info.append(price)
                                    shelf_info.append(price)
                                    # shelf_info.append(auther)
                                    shelf_info.append("8888800")
                                    shelf_info.append(condition[results[i][0]])
                                    shelf_info.append("1")
                                    shelf_info.append("1")
                                    shelf_info.append("N")
                                    shelf_info.append("5")
                                else:
                                    lock.acquire()
                                    delete_file.write("\t".join(book_info) + "\n")
                                    delete_file.flush()
                                    lock.release()
                                    break
                                # # Without seller exclusion:
                                # prices=re.findall('<td><span class="PDP_itemPrice">(.*?)</span></td>.*?<td><span class="PDP_itemPrice">(.*?)</span></td>',str(results[i]),re.S)
                                # if prices:
                                #     book_info.append(prices[0][0])
                                #     book_info.append(prices[0][1])
                                # else:
                                #     prices=re.findall('<td><span class="PDP_itemPrice">(.*?)</span></td>',str(results[i]),re.S)
                                #     if prices:
                                #         book_info.append(prices[0])
                                #         book_info.append('')
                                lock.acquire()
                                result_file.write("\t".join(book_info) + "\n")
                                result_file.flush()
                                onshelf_file.write("\t".join(shelf_info) + "\n")
                                onshelf_file.flush()
                                lock.release()
                                print("success:", isbn)
                            # NOTE(review): placement reconstructed from the
                            # sibling get_book_isbn(): the success record is
                            # written once after all conditions are handled.
                            lock.acquire()
                            success_isbn_file.write(isbn13 + "\n")
                            success_isbn_file.flush()
                            lock.release()
                        else:
                            print("offshelf:", isbn)
                            lock.acquire()
                            offshelf_isbn_file.write(isbn13 + "\n")
                            offshelf_isbn_file.flush()
                            lock.release()
            except BaseException as e:
                print(isbn, e)
                logger2.exception(str(e))
        else:
            # Blocked by the site's "Security Measure" page.
            lock.acquire()
            not_crawl_file.write(isbn)
            not_crawl_file.flush()
            lock.release()
    else:
        # Page failed to load or listed no products.
        lock.acquire()
        not_list_file.write(isbn)
        not_list_file.flush()
        lock.release()
def get_book_isbn(url):
    """Scrape one half.ebay.com product page for ISBNs and per-condition
    prices, writing one tab-separated row per condition to ``result_file``.

    Crawled URLs are logged to ``success_url_file``; pages without
    condition tables go to ``offshelf_isbn_file``; "Security Measure"
    blocks go to ``not_crawl_file``.  Shared files are guarded by ``lock``.
    """
    try:
        time.sleep(2.5)  # throttle requests
        html = get_html.get_html(url.replace("\n", ""))
        if html and -1 == html.find("Security Measure"):
            try:
                isbn13 = ''
                isbn = re.findall(
                    'ISBN-10:</b>\s+<span class=""><a href=.*? class="pdplinks">(.*?)</a>',
                    html)
                if isbn:
                    isbn = isbn[0]
                else:
                    isbn = ''
                isbn_13 = re.findall(
                    'ISBN-13:</b>\s+<span class=""><a href=.*? class="pdplinks">(.*?)</a>',
                    html)
                if isbn_13:
                    isbn13 = isbn_13[0]
                # One (condition title, listing table) pair per condition.
                results = re.findall(
                    '<h2 class="PDP_itemConditionTitle">(.*?)</h2>.*?<table cellpadding="0" cellspacing="0" class="PDP_itemList">(.*?)</table>',
                    html, re.S)
                if results:
                    for i in range(len(results)):
                        book_info = []
                        book_info.append(isbn)
                        book_info.append(isbn13)
                        book_info.append(results[i][0])
                        # Seller exclusion: price cells paired with sellers.
                        prices = re.findall(
                            '<td><span class="PDP_itemPrice">(.*?)</span></td>.*?<td><a class="PDP_sellerName" href=".*?">(.*?)</a></td>',
                            str(results[i]), re.S)
                        exclude_seller = [
                            'alibris_books_01', 'alibris_books_02',
                            'alibris_books_03', 'alibris_books_04',
                            'alibris_books_05', 'alibris_books_06',
                            'alibris_books_07', 'alibris_books_08',
                            'alibris_books_09', 'alibris', 'alibris_movies'
                        ]
                        if len(prices) == 1:
                            if prices[0][1] not in exclude_seller:
                                book_info.append(prices[0][0])
                        elif len(prices) == 2:
                            # BUG FIX: the inner loops reused "i", clobbering
                            # the outer condition index; renamed to "j",
                            # consistent with get_book_info().
                            for j in range(2):
                                if prices[j][1] not in exclude_seller:
                                    book_info.append(prices[j][0])
                        else:
                            count = 0
                            for j in range(len(prices)):
                                if prices[j][1] not in exclude_seller:
                                    book_info.append(prices[j][0])
                                    count += 1
                                    if count == 2:  # keep at most two prices
                                        break
                        lock.acquire()
                        result_file.write("\t".join(book_info) + "\n")
                        result_file.flush()
                        lock.release()
                        print("success:", isbn)
                    lock.acquire()
                    success_url_file.write(url)
                    success_url_file.flush()
                    lock.release()
                else:
                    print("offshelf:", isbn)
                    lock.acquire()
                    offshelf_isbn_file.write(isbn + "\n")
                    offshelf_isbn_file.flush()
                    lock.release()
            except BaseException as e:
                print(isbn, e)
                logger2.exception(str(e))
        else:
            lock.acquire()
            not_crawl_file.write(url)
            not_crawl_file.flush()
            lock.release()
    except Exception as e:
        print('error:', str(e))
        logger2.error(str(e))
def get_info(html, itemsId):
    """Parse a Newegg product page into the items_json schema and append it
    as one JSON line to the shared ``result_file``.

    html    -- product-detail page markup
    itemsId -- Newegg item number (reused as isbn/product_id/variation id)
    """
    items_json = {
        'registered_land': '',
        'isbn': '',
        'description': '',
        'weight': '0.0000',
        'ean': '',
        'mpn': '',
        'key_name': ' ',
        'price': '',
        'height': '0.0000',
        'currency': 'USD',
        'brand': '',
        'length_class': '',
        'product_id': '',
        'category': '',
        'jan': '',
        'seller_id': '',
        'name': '',
        'keyword': '',
        'weight_class': 'kg',
        'url': '',
        'key_attribute': '',
        'detail': {},
        'shipping': '',
        'orders': 0,
        'reviews': '0',
        'width': '0.0000',
        'length': '0.0000',
        'location': '',
        'attributes': [{
            'price': '',
            'variation_id': '',
            'dictory': 'Ships From',
            'attributes': {
                'Ships From': ''
            },
            'image': [],
            'quantity': 99
        }],
        'category_id_path': '',
        'category_id': '',
        'upc': '',
        'image': []
    }
    items_json['isbn'] = itemsId
    # The "was" price lives on a separate mapping-price endpoint.
    url = 'http://www.newegg.com/Product/MappingPrice2012.aspx?Item=' + itemsId
    html_price = get_html.get_html(url)
    price = re.findall(
        '<span class="price-was-data" style="display: none">(.*?)</span>',
        html_price)
    price = price[0] if price else 'None'
    items_json['price'] = price
    items_json['url'] = url
    # BUG FIX: previously the '0'/'' defaults were overwritten by the match
    # object itself, so a miss stored None instead of the default.
    ship = '0'
    ship_match = re.search(r'product_default_shipping_cost:\[(.*?)\]', html)
    if ship_match:
        ship = ship_match.group(1).replace("'", "")
    items_json['shipping'] = ship
    brand = ''
    brand_match = re.search(r'product_manufacture:\[(.*?)\]', html)
    if brand_match:
        brand = brand_match.group(1).replace("'", "")
    items_json['brand'] = brand
    name = ''
    name_info = re.search(r'product_title:\[(.*?)\]', html)
    if name_info:
        # Strip quoting and rewrite the numeric HTML entities Newegg emits.
        name = name_info.group(1).replace("'", "").replace(
            '&', '').replace("#34;", "''").replace("#40;", "(").replace(
                '#41;', ')').replace("#47;", "\\")
        # BUG FIX: removed name.decode("ascii").encode("utf-8") -- Python 3
        # str has no .decode() (AttributeError), and the round-trip was a
        # no-op anyway.
    items_json['name'] = name
    items_json['weight'] = 0.0000
    items_json['weight_class'] = 'kg'
    items_json['height'] = 0.0000
    items_json['width'] = 0.0000
    items_json['length_class'] = 'cm'
    items_json['product_id'] = itemsId
    items_json['reviews'] = '0'
    items_json['upc'] = ''
    items_json['seller_id'] = ''
    # ----- detail: specification table -----
    Specification = {}
    spct_info = re.search(r'<div id="Specs" class=.*?>(.*?)</div>', html, re.S)
    if spct_info is not None:
        spct_list = re.findall(r'<dl><dt>(.*?)</dt><dd>(.*?)</dd></dl>',
                               spct_info.group(1), re.S)
        for spct in spct_list:
            # BUG FIX: re.S was being passed as re.sub's *count* argument;
            # it belongs in flags=.
            temp1 = re.sub(r'<[^>]+>', '', spct[0], flags=re.S)
            temp2 = re.sub(r'<[^>]+>', '', spct[1], flags=re.S)
            Specification[temp1] = temp2
    items_json['detail'] = Specification
    # ----- images: image-set list first, then the legacy name list -----
    image = []
    image_info = re.search(r'"imageSetImageList":"(.*?)"', html, re.S)
    if image_info is not None:
        for images in image_info.group(1).split(','):
            image.append('http://images17.newegg.com/is/image/newegg/' + images)
    else:
        image_info = re.search(r'"imageNameList":"(.*?)"\}', html, re.S)
        if image_info is not None:
            for images in image_info.group(1).split(','):
                if images != '"dfis360ImgFlag":"':
                    images = images.split('"')[0]
                    image.append('http://images10.newegg.com/ProductImage/' +
                                 images)
    items_json['image'] = image
    # Top-level category name -> internal category-id prefix.
    category_dict = {
        'Computer Systems': 'ID-CS-503',
        'Components': 'ID-C-504',
        'Electronics': 'ID-E-505',
        'Gaming': 'ID-G-506',
        'Networking': 'ID-N-507',
        'Office Solutions': 'ID-OS-508',
        'Software Services': 'ID-SS-509',
        'Automotive Industrial': 'ID-AI-510',
        'Home Tools': 'ID-HT-511',
        'Health Sports': 'ID-HS-512',
        'Apparel Accessories': 'ID-AA-513',
        'Hobbies Toys': 'ID-HT-514'
    }
    category_string_html = re.findall(
        r'<div id="baBreadcrumbTop" style="max-width:1420px; margin:0px auto;">(.*?)</div>',
        html, re.S)
    category_html_list = re.findall(r'title="(.*?)"',
                                    str(category_string_html), re.S)
    print(category_html_list)
    # The first two breadcrumb entries are not category names.
    category_string = ''
    for category_s in category_html_list[2:]:
        category_string = category_string + category_s + '>'
    items_json['category'] = category_string[:-1]
    category_url_list = re.findall(r'href="(.*?)"', str(category_string_html),
                                   re.S)
    category_id_path = ''
    for url_path in category_url_list:
        url_path = url_path.split('?')[0]
        category_id_path = category_id_path + url_path.split('/')[-1] + '>'
    items_json['category_id_path'] = str(
        category_dict[category_html_list[2]]) + category_id_path[5:-1]
    items_json['category_id'] = category_id_path.split('>')[-2]
    items_json['attributes'][0]['price'] = str(price)
    items_json['attributes'][0]['variation_id'] = itemsId + '_' + itemsId
    # BUG FIX: guard the first-image lookup; image[0] raised IndexError when
    # neither image list was present on the page.
    if image:
        items_json['attributes'][0]['image'] = image[0]
    # ----- short description: feature bullet list -----
    details = []
    description_str = ''
    detail_info = re.search(r'<ul class="itemColumn">(.*?)</ul>', html, re.S)
    if detail_info is not None:
        bullet_html = detail_info.group(1).replace('\r\n', '').replace('\t', '')
        for bullet in re.findall(r'<li.*?>(.*?)</li>', bullet_html, re.S):
            # BUG FIX: dropped the Python-2 decode/encode round-trip.
            details.append(bullet.strip())
    for description in details:
        description_str = description_str + description + ';'
    items_json['description'] = description_str
    result_file.write(json.dumps(items_json) + '\n')
    print('=============')
    print(items_json)
    result_file.flush()
def get_info(items_info):
    """Build the per-item field dict from one Walmart API item record.

    items_info -- dict decoded from the Walmart items API
    Returns the merged dict (items_result fields + feature columns from
    get_feature_dict()).
    """
    items_result = {
        'itemsId': '',
        'price': '',
        'ship': '',
        'stock': '',
        'image1': '',
        'brand': '',
        'title': '',
        'product_attr': '',
        'Specification': ''
    }
    # BUG FIX: dict.has_key() was removed in Python 3; use "in" / .get().
    price = items_info.get('salePrice', '')
    items_result['price'] = price
    sku = items_info.get('itemId', '')
    items_result['itemsId'] = sku
    items_url = 'http://www.walmart.com/ip/' + str(sku)
    # BUG FIX: the fallback was assigned to a misspelled name (item_html),
    # so a fetch failure left items_html undefined and raised NameError.
    items_html = ''
    try:
        items_html = get_html.get_html(items_url)
    except Exception as e:
        print(e)
    stock = ''
    if 'stock' in items_info:
        stock_info = items_info['stock']
        if stock_info == 'Not Available' or stock_info == 'Not available':
            stock = 'out of stock'
        if stock_info == 'Available':
            stock = 'in stock'
    items_result['stock'] = stock
    ship = items_info.get('standardShipRate', '0')
    items_result['ship'] = ship
    feature = items_info.get('shortDescription', '')
    items_result['feature'] = feature
    title = items_info.get('name', '')
    items_result['title'] = title
    brand = ''
    if 'brandName' in items_info:
        brand = items_info['brandName']
    else:
        # Fall back to scraping the product page for the brand.
        brand_info = re.search(r'<span itemprop=brand>(.*?)</span>',
                               items_html, re.S)
        if brand_info is not None:
            brand = brand_info.group(1)
    items_result['brand'] = brand
    reviews = items_info.get('numReviews', '')
    items_result['reviews'] = reviews
    product_attr = items_info.get('attributes', '')
    items_result['product_attr'] = product_attr
    feature_list = []
    if 'shortDescription' in items_info:
        # BUG FIX: HTMLParser.HTMLParser().unescape() is the removed
        # Python-2 API; the stdlib html module provides unescape().
        import html as _htmllib
        short_description_html = _htmllib.unescape(
            items_info['shortDescription'])
        short_description = re.sub(r'<[^>]+>', '', short_description_html)
        feature_list = short_description.split('.')
    feature_dic = get_feature_dict(feature_list)
    Specifications = {}
    Specifications_list = re.findall(
        r'<tr class=js-product-specs-row>[\s]*<td>(.*?)</td>[\s]*<td>(.*?)</td>[\s]*</tr>',
        items_html, re.S)
    for Specifications_info in Specifications_list:
        # BUG FIX: re.S was passed as re.sub's *count* argument; use flags=.
        tmp1 = re.sub(r'<[^>]+>', '', Specifications_info[0], flags=re.S)
        tmp2 = re.sub(r'<[^>]+>', '', Specifications_info[1], flags=re.S)
        Specifications[tmp1] = tmp2
    if len(str(Specifications)) > 1000:
        Specifications = str(Specifications)[:1000]
    items_result['Specification'] = Specifications
    if 'largeImage' in items_info:
        items_result['img1'] = items_info['largeImage']
    # BUG FIX: dict_items objects cannot be added in Python 3; merge via
    # dict unpacking instead.
    items_dic = {**items_result, **feature_dic}
    return items_dic
# data = gzipper.read()
# html = data
# return html
# except Exception as e:
#     print 'error:', str(e)
#     logger2.error(str(e))
#     get_html(url)

# Fetch the book-category URLs from the half.ebay.com front page and write
# them to ./result/cate_url.txt as "<url>|<category name>" lines.
if __name__ == '__main__':
    if not os.path.exists('./result'):
        os.mkdir('./result')
    logger1.info('get category url begin')
    # BUG FIX: 'aw' is not a valid open() mode in Python 3; the old code
    # immediately truncate(0)-ed anyway, so plain 'w' is equivalent.
    with open('./result/cate_url.txt', 'w') as f:
        try:
            base_category_url = 'http://books.half.ebay.com/'
            html = get_html.get_html(base_category_url)
            separator = re.findall(
                '<div class="separator">.*?<ul class="metascateitems">(.*?)</ul>',
                html, re.S)[0].replace('\n', '').replace('\t',
                                                         '').replace('\r', '')
            cate_urls = re.findall('<li><a href="(.*?)">(.*?)</a></li>',
                                   separator)
            for cate_url in cate_urls:
                url = cate_url[0]
                # BUG FIX: was the no-op .replace('&', '&') -- a mojibake of
                # the intended HTML-entity decode '&amp;' -> '&'.
                cate = cate_url[1].replace('&amp;', '&')
                f.write(url + '|' + cate + '\n')
            print('-----------get category url successfuly--------------')
        except Exception as e:
            print('error:', str(e))
            logger2.error(str(e))
    logger1.info('get category url success')
def get_info(html, itemsId):
    """Parse a Newegg product page into tab-separated columns, ordered by
    the global ``titles`` list, and append one row to ``result_file``.

    html    -- product-detail page markup
    itemsId -- Newegg item number
    """
    items_info = {
        'itemsId': itemsId,
        'price': '',
        'Original_price': '',
        'ship': '',
        'stock': '',
        'brand': '',
        'title': '',
        'Specification': ''
    }
    # Current price from the page's meta tag.
    price = re.findall("<meta itemprop='price' content='(.*?)'", html)[0]
    items_info['price'] = price
    # Original ("was") price comes from the mapping-price endpoint.
    url = 'http://www.newegg.com/Product/MappingPrice2012.aspx?Item=' + itemsId
    html_price = get_html.get_html(url)
    orgin_price = re.findall(
        '<span class="price-was-data" style="display: none">(.*?)</span>',
        html_price)
    if orgin_price:
        orgin_price = orgin_price[0]
    else:
        orgin_price = 'None'
    items_info['Original_price'] = orgin_price
    # BUG FIX: keep the '0'/'' defaults; previously the re.search match
    # object overwrote them, so a miss stored None instead.
    ship = '0'
    ship_match = re.search(r'product_default_shipping_cost:\[(.*?)\]', html)
    if ship_match:
        ship = ship_match.group(1).replace("'", "")
    items_info['ship'] = ship
    stock = ''
    stock_info = re.search(r'product_instock:\[(.*?)\]', html)
    if stock_info:
        stock_info = stock_info.group(1).replace("'", "")
        if int(stock_info) == 1:
            stock = 'In stock'
        else:
            stock = 'out of stock'
    items_info['stock'] = stock
    brand = ''
    brand_match = re.search(r'product_manufacture:\[(.*?)\]', html)
    if brand_match:
        brand = brand_match.group(1).replace("'", "")
    items_info['brand'] = brand
    name = ''
    name_info = re.search(r'product_title:\[(.*?)\]', html)
    if name_info:
        # Strip quoting and rewrite the numeric HTML entities Newegg emits.
        name = name_info.group(1).replace("'", "").replace(
            '&', '').replace("#34;", "''").replace("#40;", "(").replace(
                '#41;', ')').replace("#47;", "\\")
        # BUG FIX: removed name.decode("ascii").encode("utf-8") -- Python 3
        # str has no .decode() (AttributeError).
    items_info['title'] = name
    # Specification table.
    Specification = {}
    spct_info = re.search(r'<div id="Specs" class=.*?>(.*?)</div>', html, re.S)
    if spct_info is not None:
        spct_list = re.findall(r'<dl><dt>(.*?)</dt><dd>(.*?)</dd></dl>',
                               spct_info.group(1), re.S)
        for spct in spct_list:
            # BUG FIX: re.S was passed as re.sub's *count* argument; flags=.
            temp1 = re.sub(r'<[^>]+>', '', spct[0], flags=re.S)
            temp2 = re.sub(r'<[^>]+>', '', spct[1], flags=re.S)
            Specification[temp1] = temp2
    items_info['Specification'] = str(Specification).replace('{', '').replace(
        '}', '').replace("'", '').replace(',', '<br>')
    if len(items_info['Specification']) >= 2000:
        items_info['Specification'] = items_info['Specification'][0:1999]
    # Images: image-set list first, then the legacy name list.
    image = []
    image_info = re.search(r'"imageSetImageList":"(.*?)"', html, re.S)
    if image_info is not None:
        for images in image_info.group(1).split(','):
            image.append('http://images17.newegg.com/is/image/newegg/' + images)
    else:
        image_info = re.search(r'"imageNameList":"(.*?)"\}', html, re.S)
        if image_info is not None:
            for images in image_info.group(1).split(','):
                if images != '"dfis360ImgFlag":"':
                    images = images.split('"')[0]
                    image.append('http://images10.newegg.com/ProductImage/' +
                                 images)
    image_dict = get_img_dict(image)
    # Feature bullet list.
    details = []
    detail_info = re.search(r'<ul class="itemColumn">(.*?)</ul>', html, re.S)
    if detail_info is not None:
        bullet_html = detail_info.group(1).replace('\r\n', '').replace('\t', '')
        for bullet in re.findall(r'<li.*?>(.*?)</li>', bullet_html, re.S):
            # BUG FIX: dropped the Python-2 decode/encode round-trip.
            details.append(bullet.strip())
    detail_dict = get_feature_dict(details)
    # BUG FIX: dict_items objects cannot be added in Python 3; merge all
    # field dicts via unpacking.
    items_dict = {**items_info, **image_dict, **detail_dict}
    for k in titles:
        # BUG FIX: has_key() was removed in Python 3; .get() with a default
        # is equivalent to the old has_key/else branch.
        value = items_dict.get(k, 'None')
        lock.acquire()
        result_file.write(str(value) + "\t")
        result_file.flush()
        lock.release()
    lock.acquire()
    result_file.write('\n')
    result_file.flush()
    lock.release()
def get_category():
    """Walk the Walmart taxonomy API and write the category tree to
    ./result/category.txt, plus leaf category ids (levels 3/4) to
    ./result/category_id.txt."""
    print('')
    global category_file
    category_file = open('./result/category.txt', 'w')
    category_id_file = open('./result/category_id.txt', 'w')
    # Roman-numeral prefixes marking tree depth in the text output.
    file = {'1': 'I ', '2': 'II ', '3': 'III ', '4': 'IV ', '5': 'V '}
    key = [
        'f7fqv4jzcdr7ccfb2b339cv9', 'fmwnnrwf53d6c5sw7b4pu2q3',
        '7sd4rpjfmdurwuwzgvpbffd2', 'jqpyjz92jmaruene4mpbe8pc',
        'z2pqv4dtuwhxe3hkesx9kqpv', 'nz2gzu5byp9dbnm6jee69jkp',
        'sfpw74s5yte8dj9r9atzyc5m ', '2tk5sghn56mnth5uabspkdt6'
    ]
    url = 'http://api.walmartlabs.com/v1/taxonomy?apiKey=2tk5sghn56mnth5uabspkdt6'
    import json  # local import: parse the API response safely
    info = get_html.get_html(url)
    # SECURITY FIX: the JSON payload was eval()-ed, which executes arbitrary
    # code from the network; parse it with json.loads instead.
    info = json.loads(info)
    categories = info['categories']
    print(len(categories))
    for category in categories:
        category1 = category['name']
        category1_id = category['id']
        category_file.write(file['1'] + category1 + '\t' + category1_id + '\n')
        category_file.flush()
        category2_info = category['children']
        print(category2_info)
        for category2_all in category2_info:
            category2 = category2_all['name']
            category2_id = category2_all['id']
            category_file.write('\t' + file['2'] + category2 + '\t' +
                                category2_id + '\n')
            category_file.flush()
            # BUG FIX: level 3 previously re-iterated category['children']
            # (the level-2 list of the top category) instead of this node's
            # own children, duplicating level 2 as level 3.
            category3_info = category2_all.get('children', [])
            for category3_all in category3_info:
                category3 = category3_all['name']
                category3_id = category3_all['id']
                category_file.write('\t\t' + file['3'] + category3 + '\t' +
                                    category3_id + '\n')
                category_file.flush()
                # BUG FIX: has_key() was removed in Python 3; use "in".
                if 'children' in category3_all:
                    category4_info = category3_all['children']
                    for category4_all in category4_info:
                        category4 = category4_all['name']
                        category4_id = category4_all['id']
                        category_file.write('\t\t\t' + category4 + '\t' +
                                            category4_id + '\n')
                        category_file.flush()
                        category_id_file.write(category4_id + '\n')
                        category_id_file.flush()
                        if 'children' in category4_all:
                            category5_info = category4_all['children']
                            print(category5_info)
                else:
                    category_id_file.write(category3_id + '\n')
                    category_id_file.flush()
    category_file.close()
    category_id_file.close()
def get_info(items_info):
    """Map one Walmart API item dict onto the export schema; return JSON.

    :param items_info: dict decoded from the Walmart Open API item payload
        (keys such as 'itemId', 'salePrice', 'name', 'categoryPath', ...).
    :return: JSON string of the normalized item record.

    Also fetches the item's product page (via the module-level ``get_html``
    helper) to scrape the brand and the specification table when the API
    payload lacks them.  Missing payload keys fall back to empty defaults
    instead of raising KeyError.
    """
    items_json = {
        'registered_land': '',
        'isbn': '',
        'description': '',
        'weight': '0.0000',
        'ean': '',
        'mpn': '',
        'key_name': ' ',
        'price': '',
        'height': '0.0000',
        'currency': 'USD',
        'brand': '',
        'length_class': '',
        'product_id': '',
        'category': '',
        'jan': '',
        'seller_id': '',
        'name': '',
        'keyword': '',
        'weight_class': 'kg',
        'url': '',
        'key_attribute': '',
        'detail': {},
        'shipping': '',
        'orders': 0,
        'reviews': '0',
        'width': '0.0000',
        'length': '0.0000',
        'location': '',
        'attributes': [{
            'price': '',
            'variation_id': '',
            'dictory': 'Ships From',
            'attributes': {
                'Ships From': ''
            },
            'image': [],
            'quantity': 99
        }],
        'category_id_path': '',
        'category_id': '',
        'upc': '',
        'image': []
    }
    # ---------------- isbn / product_id ----------------
    # dict.has_key() was removed in Python 3; `in` works in both.
    sku = ''
    if 'itemId' in items_info:
        sku = items_info['itemId']
    items_json['isbn'] = sku
    items_json['product_id'] = sku
    # ---------------- price ----------------
    price = ''
    if 'salePrice' in items_info:
        price = items_info['salePrice']
    items_json['price'] = str(price)
    items_url = 'http://www.walmart.com/ip/' + str(sku)
    # Bug fix: the original initialised a misspelled `item_html`, so a
    # failed fetch caused a NameError when `items_html` was searched below.
    items_html = ''
    try:
        items_html = get_html.get_html(items_url)
    except Exception as e:
        print(e)
    # ---------------- shipping ----------------
    ship = '0'
    if 'standardShipRate' in items_info:
        ship = items_info['standardShipRate']
    items_json['shipping'] = ship
    # ---------------- name ----------------
    title = ''
    if 'name' in items_info:
        title = items_info['name']
    items_json['name'] = title
    # ---------------- brand ----------------
    brand = 'no brand'
    if 'brandName' in items_info:
        brand = items_info['brandName']
    else:
        # Fall back to scraping the product page HTML.
        brand_info = re.search(r'<span itemprop=brand>(.*?)</span>',
                               items_html, re.S)
        if brand_info is not None:
            brand = brand_info.group(1)
    items_json['brand'] = brand
    # ---------------- category ----------------
    category_path = items_info.get('categoryPath', '')
    items_json['category'] = str(category_path).replace('/', '>')
    # ---------------- seller_id ----------------
    items_json['seller_id'] = ''
    # ---------------- reviews ----------------
    # Bug fix: keep the schema default '0' when the count is missing
    # instead of clobbering it with an empty string.
    reviews = '0'
    if 'numReviews' in items_info:
        reviews = items_info['numReviews']
    items_json['reviews'] = reviews
    # ---------------- category_id_path / category_id ----------------
    category_node = str(items_info.get('categoryNode', ''))
    category_id_path = category_node.replace('_', '>')
    items_json['category_id_path'] = category_id_path[:-1]
    # Bug fix: split the *raw* node string on '_'; the original split the
    # already '>'-joined string on '_', which always yielded the whole path.
    items_json['category_id'] = category_node.split('_')[-1]
    # ---------------- upc ----------------
    items_json['upc'] = items_info.get('upc', '')
    # ---------------- description (short, de-tagged, '.'->';') ----------------
    feature_list = []
    description_str = ''
    if 'shortDescription' in items_info:
        short_description_info = items_info['shortDescription']
        html = HTMLParser.HTMLParser()
        short_description_html = html.unescape(short_description_info)
        short_description = re.sub(r'<[^>]+>', '', short_description_html)
        feature_list = short_description.split('.')
        for description in feature_list:
            description_str = description_str + description + ';'
    items_json['description'] = description_str
    # ---------------- dimensions (not provided by the feed) ----------------
    items_json['weight'] = 0.0
    items_json['height'] = 0.0
    items_json['width'] = 0.0
    items_json['length_class'] = 'cm'
    items_json['weight_class'] = 'kg'
    # ---------------- url ----------------
    items_json['url'] = str(items_info.get('productUrl', ''))
    # ---------------- detail (spec table scraped from the page) ----------------
    Specifications = {}
    Specifications_list = re.findall(
        r'<tr class=js-product-specs-row>[\s]*<td>(.*?)</td>[\s]*<td>(.*?)</td>[\s]*</tr>',
        items_html, re.S)
    for Specifications_info in Specifications_list:
        # Bug fix: the original passed re.S as re.sub's 4th *positional*
        # argument, which is `count` (=16), silently capping the number of
        # tag removals.  DOTALL is irrelevant to this pattern, so drop it.
        tmp1 = re.sub(r'<[^>]+>', '', Specifications_info[0])
        tmp2 = re.sub(r'<[^>]+>', '', Specifications_info[1])
        Specifications[tmp1] = tmp2
    items_json['detail'] = str(Specifications)
    # ---------------- image ----------------
    image = ''
    if 'largeImage' in items_info:
        image = items_info['largeImage']
    items_json['image'] = image
    # ---------------- attributes ----------------
    items_json['attributes'][0]['price'] = str(price)
    variation_id = str(items_info.get('parentItemId', '')) + '_' + str(
        items_info.get('itemId', ''))
    items_json['attributes'][0]['variation_id'] = str(variation_id)
    # NOTE(review): 'largeImage' appears to be a single URL string, so
    # image[0] is just its first character — confirm whether [image] was
    # intended.  Guarded so an empty image no longer raises IndexError.
    items_json['attributes'][0]['image'] = image[0] if image else ''
    json_file = json.dumps(items_json)
    return json_file