def parse_category(self, category_list_path, link, content):
    """Parse one category page and enqueue its sub-categories.

    category_list_path -- list of ancestor category names leading here.
    link -- URL of the page being parsed (used only in log messages).
    content -- raw HTML of the category page.

    A page with no sub-category links is treated as a leaf: its total
    item count is read from the listing header and stored via
    conn.set_leaf_category.  Otherwise every sub-category is persisted
    and pushed onto self.queue as a (cats, url) pair.
    """
    tree = lxml.html.fromstring(content)
    items = tree.xpath('//div[@class="narrowcontent"]//a[@class="headlink"]')
    if not items:
        # No sub-category links: assume leaf and read the item total.
        try:
            total_num = tree.xpath('//div[@id="rightcol"]//div[@id="top-padbar"]/div/strong[2]')[0].text
            self.conn.set_leaf_category(category_list_path, int(total_num))
        except:
            log.log_traceback(self.logger_category, '!* Do not get how many items of this category: {0}'.format(link))
    for item in items:
        category_link = item.get('href')
        url = self.siteurl + category_link if category_link else ''
        category = item.text.strip('\n')
        # trap "Household Insulation":
        # http://www.amazon.com/s/ref=sr_ex_n_1?rh=n%3A228013%2Cn%3A!468240%2Cn%3A551240%2Cn%3A495346&bbn=495346&ie=UTF8&qid=1344909516
        # Skip a category that already appears in its own ancestor path
        # to avoid crawling in circles.
        if category in category_list_path:
            continue
        cats = category_list_path + [category]
        log.log_print('category ==> {0}'.format(cats), self.logger_category)
        if self.conn.category_updated(' > '.join(cats)):
            # already refreshed recently; skip re-queueing
            continue
        if url:
            log.log_print('url ==> {0}'.format(url), self.logger_category)
        else:
            # Missing href: logged at ERROR but the category is still recorded.
            log.log_print('url ==> {0}'.format(url), self.logger_category, logging.ERROR)
        log.log_print('queue size ==> {0}'.format(self.queue.qsize()), self.logger_category)
        self.conn.insert_update_category(cats, url)
        self.queue.put((cats, url))
def parse_category(self, category_list_path, url, content):
    """Parse a Cabela's left-nav category page.

    Drills down the chain of "active" nav entries; a node with no child
    entries is a leaf whose total item count is read from the pagination
    "Showing ... of N total" text and saved on the ORM category.  Child
    categories are saved and queued as (cats, link) for further crawling.
    """
    tree = lxml.html.fromstring(content)
    node = tree.xpath('//div[@id="siteContent"]//div[@class="layoutLeftColumn"]//div[@class="leftnav_content"]')
    if node:
        node = node[0]
    else:
        log.log_traceback(self.logger_category, 'Url can not be parsed {0}'.format(url))
        return
    # Follow the "active" nav entries down to the current category node.
    while node.xpath('./ul/li[@class="active"]'):
        node = node.xpath('./ul/li[@class="active"]')[0]
    items = node.xpath('./ul/li')
    if not items:
        # Leaf category: parse the count out of "Showing x of N total".
        showing = tree.xpath('//div[@id="siteContent"]//div[@class="layoutCenterColumn"]/div[@class="pagination"]//td[@class="showing"]/text()')[0]
        num = showing.split('of')[-1].split('total')[0].strip()
        cate = self.get_orm_category(url)
        cate.num = int(num)
        cate.is_leaf = True
        cate.save()
    else:
        for item in items:
            l = item.xpath('./a/@href')[0]
            # hrefs may be site-relative; prefix the host when needed
            link = l if l.startswith('http') else 'http://www.cabelas.com' + l
            category = item.xpath('./a/text()')[0]
            cate = self.get_orm_category(link)
            cate.cats = category_list_path + [category]
            cate.save()
            log.log_print('category ==> {0}'.format(cate.cats), self.logger_category)
            log.log_print('queue size ==> {0}'.format(self.queue.qsize()), self.logger_category)
            self.queue.put((cate.cats, link))
def parse_listing(self, catstr, url, content):
    """Walk every page of a listing and hand each page to get_info.

    ITEM_PER_PAGE: 25 in setting.py

    catstr -- category path string used to tag products.
    url -- first-page listing URL; page i is built by inserting
           '/pn/<i>/' before the last two path segments.
    content -- raw HTML of the first listing page.
    """
    tree = lxml.html.fromstring(content)
    try:
        num = tree.xpath('//div[@class="tMain"]//div[@id="Plistbar"]/div[@id="PfirstRow"]/span[@id="PitemNum"]/text()')[0]
        total_num = int(num.replace('\t', '').strip().split('\n')[-1].split()[0])
    except:
        log.log_traceback(self.logger_list, '!* Do not get item numbers of category [{0}]: {1}'.format(catstr, url))
        # print content
        # back off before the caller retries; the page was likely throttled
        time.sleep(30)
        return
    log.log_print('{0} items in {1}'.format(total_num, url), self.logger_list)
    page_num = (total_num - 1) // ITEM_PER_PAGE + 1
    if page_num == 0:
        log.log_print('Listing page do not have any items! -- {0}'.format(url), self.logger_list, logging.ERROR)
    elif page_num == 1:
        self.get_info(url, catstr, total_num, page_num)
    else:
        part1 = '/'.join( url.split('/')[:-2] )
        part2 = '/'.join( url.split('/')[-2:] )
        for i in xrange(1, page_num):
            self.get_info(part1 + '/pn/{0}/'.format(i) + part2, catstr, ITEM_PER_PAGE, i)
            time.sleep(0.5)
        # BUGFIX: the original passed `total_num % ITEM_PER_PAGE`, which is 0
        # when total_num is an exact multiple of ITEM_PER_PAGE, so the last
        # full page was reported as having zero items.  Compute the real
        # remaining count instead.
        last_page_count = total_num - ITEM_PER_PAGE * (page_num - 1)
        self.get_info(part1 + '/pn/{0}/'.format(page_num) + part2, catstr, last_page_count, page_num)
        time.sleep(0.5)
def parse_product(self, sku, url, content):
    """Parse a product detail page and store offers/specifications.

    sku -- product identifier used as the DB key.
    url -- product URL (log messages only).
    content -- raw HTML bytes; decoded as UTF-8 with replacement.

    NOTE(review): `specifications` is stored as the flat list of text
    nodes from the spec tab; the commented-out code below shows an
    earlier attempt to fold it into a key/value dict — confirm which
    shape conn.update_product expects.
    """
    tree = lxml.html.fromstring(content.decode('utf-8','replace'))
    try:
        node = tree.xpath('//div[@id="content"]/div[@id="pdpcenterwell"]')[0]
    except:
        log.log_traceback(self.logger_product, 'Product have problem when parsing: {0}'.format(url))
        return
    #item = node.xpath('.//div[@id="productsummary"]/div[@id="financing"]//li/a/text()') # will add a \n to the tail of every field
    offer= node.xpath('.//div[@id="productsummary"]/div[@id="financing"]//li/a')
    #['\n18-Month Financing', '\nGet 4% Back in Rewards: See How']
    if offer:
        offers = [a.text_content().strip() for a in offer]
    else:
        offers = []
    specifications = []
    try:
        specifications = node.xpath('.//div[@id="productdetail"]/div[@id="pdptabs"]/div[@id="tabbed-specifications"]//li/div//text()')
        # if specification:
        #     spec = []
        #     for a in specification:
        #         if 'Customer Reviews' in a:
        #             break
        #         if a != '\n':
        #             spec.append( a.strip('\n') )
        #     length = len(spec)
        #     i = 0
        #     key = ''
        #     while i < length:
        #         if i + 2 < length and spec[i+2] == ' ':
        #             # mulit value pair: ['software include', 'vim', ' ', 'emacs', ' ', 'process', 'intel']
        #             if key:
        #                 specifications[key].append(spec[i+1])
        #             else:
        #                 specifications[ spec[i] ] = [ spec[i+1] ]
        #                 key = spec[i]
        #             # print key, specifications[key]
        #         else:
        #             # normal pair: ['cpu', 'amd', 'brand', 'dell']
        #             key = ''
        #             if spec[i] is ' ' or spec[i] is '':
        #                 # ['Estimated Yearly Operating Cost', '$17', '', 'UPC', '600603146435', ' ', '']
        #                 i += 1
        #                 if i >= length: # last item is ' '
        #                     break
        #                 continue
        #             else:
        #                 specifications[ spec[i] ] = spec[i+1]
        #                 # print spec[i], specifications[ spec[i] ]
        #         i += 2
    except:
        log.log_traceback(self.logger_product, 'Product specifications parsing problem: {0}'.format(url))
    self.conn.update_product(sku, offers, specifications)
def cycle_crawl_category(self, timeover=60):
    """Drain the category queue, fetching and parsing each queued page.

    timeover -- seconds to block waiting on the queue before logging a
    timeout.  Any other parse/fetch error is logged and the loop moves
    on to the next job.
    """
    while not self.queue.empty():
        try:
            task = self.queue.get(timeout=timeover)
            page = self.fetch_page(task[1])
            self.parse_category(task[0], task[1], page)
        except Queue.Empty:
            log.log_traceback(self.logger_category, 'Queue waiting {0} seconds without response!'.format(timeover))
        except:
            log.log_traceback(self.logger_category)
def parse_product(self, bah, url, content):
    """Parse a B&H product page: payment offers, buy-together bundle,
    spec table, what's-in-the-box list and rating, then persist them.

    bah -- product identifier (DB key).
    url -- product URL (log messages only).
    content -- raw HTML of the product page.
    """
    tree = lxml.html.fromstring(content)
    try:
        node = tree.xpath('//div[@class="tMain"]//div[@id="productAllWrapper"]/div[@id="productMainWrapper"]')
        if node:
            node = node[0]
        else:
            return
    except:
        log.log_traceback(self.logger_product, 'Product have problem when parsing: {0}'.format(url))
        # BUGFIX: the original fell through here with `node` unbound,
        # which raised NameError on the next statement.
        return
    bill_later = node.xpath('.//div[@id="productRight"]//div[contains(@class, "altPayment findLast")]//li/a//text()')
    bill_later = [a.strip() for a in bill_later]
    info = []
    buy_together = tree.xpath('.//div[@class="productInfoArea adm findLast"]/a/@href')
    if buy_together:
        # The bundle lives behind a dialog URL; fetch and parse it.
        content = self.fetch_page(buy_together[0])
        intree = lxml.html.fromstring(content)
        ones = intree.xpath('//div[@class="ui-dialog-content"]//div[@class="col titleDetails"]')
        for one in ones:
            title = one.xpath('./div[@class="title"]/span//text()')
            # NOTE(review): `info` is reassigned on every iteration, so only
            # the last bundle entry survives — confirm whether all entries
            # were meant to be accumulated.
            info = [t.strip() for t in title if t.strip()]
            model = one.xpath('./div[@class="details"]/p/text()')
            info.append(model[0])
        buy_together = info
    specifications = {}
    tables = node.xpath('.//div[@id="bottomWrapper"]//div[@id="Specification"]//table[@class="specTable"]')
    for table in tables:
        key = table.xpath('.//tr/td[@class="specTopic"]')
        value = table.xpath('.//tr/td[@class="specDetail"]')
        k = [k.text_content().strip() for k in key]
        v = [v.text_content().strip().replace('\n', '') for v in value]
        specifications.update( dict(zip(k, v)) )
    in_box = node.xpath('.//div[@id="bottomWrapper"]//div[@id="WhatsInTheBox"]/ul/li')
    in_box = [a.text_content().strip() for a in in_box]
    rating = node.xpath('.//div[@id="bottomWrapper"]//div[@id="costumerReview"]//div[@class="pr-snapshot-rating rating"]/span/text()')
    if rating:
        rating = rating[0]
    else:
        rating = ''
    # items = node.xpath('.//div[@id="bottomWrapper"]//div[@class="accGroup "]//form[@class="addToCartForm"]//div[@class="accDetails"]')
    # for item in items:
    #     title = item.xpath('./div[1]')
    #     title[0].text_content()
    #     model = item.xpath('./div[@class="ItemNum"]/span')
    #     model[0].text_content()
    self.conn.update_product(bah, bill_later, specifications, in_box, rating, buy_together)
    time.sleep(1)
def parse_listing(self, url, catstr, tree, page_num, num_in_this_url):
    """ ITEM_PER_PAGE: 48 in settings.py

    Parse one Cabela's results page (already-built `tree`) and upsert a
    Product document per item: title, link, price, sell rank, category.

    url -- page URL, only for log messages.
    catstr -- category path string appended to product.catstrs.
    page_num -- 1-based page index used to compute sell_rank.
    num_in_this_url -- expected item count; a mismatch is logged only.
    """
    try:
        nodes = tree.xpath('//div[@id="siteContent"]//div[@class="layoutCenterColumn"]/div[@class="itemsWrapper"]/div[@class="resultsColumn"]//div[@class="itemEntryInner"]')
    except:
        log.log_traceback(self.logger_list, 'Did not parse node: {0}'.format(url))
        # log.log_print('content: {0}'.format(content), self.logger_list, logging.DEBUG)
        return
    if len(nodes) != num_in_this_url:
        log.log_traceback(self.logger_list, '{0} num_in_this_url: {1}, actual_num: {2}'.format(url, num_in_this_url, len(nodes)) )
    timenow = datetime.utcnow()
    for j in xrange(len(nodes)):
        # Sale price first, then the regular price node.
        price = nodes[j].xpath('.//div[@class="price"]/div/div[@class="textSale"]/text()')
        if not price:
            price = nodes[j].xpath('.//div[@class="price"]/div/div/text()')
        if not price:
            price = ''
            log.log_traceback(self.logger_list, 'do not get price {0} {1}'.format(url, j))
        else:
            price = price[0]
        # NOTE(review): `.//id/a[...]` selects an <id> element — looks like a
        # typo for some container tag; also the bare [0] raises IndexError
        # when absent.  Confirm against a live page.
        t = nodes[j].xpath('.//id/a[@class="itemName"]')[0]
        title = t.text_content()
        if not title:
            title = ''
            log.log_traceback(self.logger_list, 'do not get title {0} {1}'.format(url, j))
        l = t.get('href')
        if not l:
            link = ''
            log.log_traceback(self.logger_list, 'do not get link {0} {1}'.format(url, j))
        else:
            link = l if l.startswith('http') else 'http://www.cabelas.com' + l
        # This rank changes all the time.If some product updated,some not, same rank on two products will happen!
        sell_rank = ITEM_PER_PAGE * (page_num-1) + j + 1
        # NOTE(review): when link == '' the match() below returns None and
        # .group(1) raises AttributeError — verify empty links cannot reach here.
        itemID = re.compile(r'.*/(\d+).uts?.*').match(link).group(1)
        product = Product.objects(itemID=itemID).first()
        if not product:
            product = Product(itemID=itemID)
        product.title = title
        product.sell_rank = sell_rank
        product.list_update_time = timenow
        product.price = price.replace('$', '').replace(',', '')
        product.updated = False
        if product.catstrs == []:
            product.catstrs.append(catstr)
        elif catstr not in product.catstrs:
            product.catstrs.append(catstr)
        product.save()
def load_data():
    """ Load all table data into the search index. """
    status = 'loaded'
    try:
        loader.download()
        load_table_data()
    except Exception as ex:
        log.log_traceback(ex)
        status = 'failed'
    return flask.jsonify({'status': status})
def parse_category(self, category_list_path, url, content):
    """Parse a category page that comes in three layouts, or a listing.

    Tries three xpaths in turn to collect (name, link) pairs; if none
    match, the page is assumed to be a leaf listing whose item total is
    stored via conn.set_leaf_category.  Each sub-category found is
    persisted and queued as (cats, link).
    """
    tree = lxml.html.fromstring(content)
    items = tree.xpath('//div[@class="tMain"]//div[@class="column"]//li/a')
    if not items:
        # second kind of category page
        items = tree.xpath('//div[@class="tMain"]//table[@class="catColumn"]//tr[@valign="top"]//a')
    if items:
        links = [item.get('href') for item in items]
        names = [item.text_content() for item in items]
    else:
        # third kind of category page
        items = tree.xpath('//div[@id="mainContent"]//div[@class="categoryGroup staticBody"]/div/a')
        if items:
            links = [item.get('href') for item in items]
            # image-based links: the name lives in the img alt attribute
            names = [item.xpath('.//img')[0].get('alt') for item in items]
        else:
            links, names = [], []
            try:
                # listing page
                num = tree.xpath('//div[@class="tMain"]//div[@id="Plistbar"]/div[@id="PfirstRow"]/span[@id="PitemNum"]/text()')[0]
                total_num = int(num.replace('\t', '').strip().split('\n')[-1].split()[0])
                self.conn.set_leaf_category(category_list_path, total_num)
            except:
                log.log_traceback( self.logger_category, '!! {0} neither a category nor a listing page.Or other errors.'.format(url) )
                log.log_print('content: {0}'.format(content), self.logger_category, logging.DEBUG)
            return
    pairs = []
    if len(links) != len(names):
        log.log_traceback(self.logger_category, '!! links num: {0}; names num: {1}. {2}'.format(len(links), len(names), url))
        return
    else:
        pairs = zip(names, links)
    for category, link in pairs:
        # trap "Household Insulation":
        # http://www.amazon.com/s/ref=sr_ex_n_1?rh=n%3A228013%2Cn%3A!468240%2Cn%3A551240%2Cn%3A495346&bbn=495346&ie=UTF8&qid=1344909516
        # NOTE(review): this indexes [-1], so an empty category_list_path
        # would raise IndexError — confirm the root call always passes a
        # non-empty path.
        if category_list_path[-1] == category:
            continue
        cats = category_list_path + [category]
        log.log_print('category ==> {0}'.format(cats), self.logger_category)
        if self.conn.category_updated(' > '.join(cats)):
            continue
        log.log_print('queue size ==> {0}'.format(self.queue.qsize()), self.logger_category)
        self.conn.insert_update_category(cats, link)
        self.queue.put((cats, link))
def dump_file(self, variable, directory, file_name):
    ''' Dump a picklable variable to directory/file_name.

    An existing file is first renamed to "<name>_bak", so the previous
    dump survives a failed or partial write.  Returns True on success,
    False when pickling fails (the failure is logged).
    '''
    full_name = os.path.join(directory, file_name)
    if os.path.isfile(full_name):
        # keep the previous dump as a backup before overwriting
        os.rename(full_name, full_name + '_bak')
    with open(full_name, 'wb') as f:
        try:
            pickle.dump(variable, f)
        except Exception:
            # narrowed from a bare `except:` so KeyboardInterrupt and
            # SystemExit are no longer swallowed
            log.log_traceback(msg='dump to ' + full_name + ' error:')
            return False
    return True
def run(self):
    ''' The get-and-do loop: pull (func, args, kwargs) tasks from
    work_queue, run them, and push truthy results onto result_queue.
    Exits when the queue stays empty for self.timeout seconds; any
    task exception is logged and the loop continues.
    '''
    while True:
        try:
            # BUGFIX(idiom): renamed the local from `callable`, which
            # shadowed the builtin of the same name.
            task, args, kwargs = self.work_queue.get(timeout=self.timeout)
            result = task(*args, **kwargs)
            if result:
                self.result_queue.put(result) # only add url_graph result, filter indexing result
            # print('worker[%d]: %s' % (self.id, str(result)))
            # formally need task_done(), but we can omit it here.
            self.work_queue.task_done()
        except queue.Empty:
            break
        except:
            message = 'worker[{0}]'.format(self.id)
            log.log_traceback(msg=message)
def load_file(self, directory, file_name):
    ''' Load a pickled dump back into a variable.

    Prefers "<name>_bak" over the plain file when both exist — the _bak
    copy is the previous dump written by dump_file before an overwrite
    (presumably the last known-good one; confirm this is intended).
    Returns False when no dump exists or unpickling fails.
    NOTE(review): a falsy dumped value (e.g. {} or 0) is therefore
    indistinguishable from failure for callers.
    '''
    full_name = os.path.join(directory, file_name)
    open_file = full_name + '_bak'
    if not os.path.isfile(open_file):
        if not os.path.isfile(full_name):
            print('No dump file exist in ', directory, file_name)
            return False
        else:
            open_file = full_name
    with open(open_file, 'rb') as f:
        try:
            variable = pickle.load(f)
        except Exception:
            # narrowed from a bare `except:` so KeyboardInterrupt and
            # SystemExit are no longer swallowed
            log.log_traceback(msg='load dump file from ' + open_file + ' error:')
            return False
    return variable
def check_dir(directory):
    ''' Ensure *directory* exists and is a directory.

    Creates it (including parents) when missing.  Returns True when the
    directory exists or was created; False when the path is an existing
    non-directory or creation fails (the OSError is logged).
    '''
    if not os.path.exists(directory):
        try:
            os.makedirs(directory)
        except OSError:
            log.log_traceback(msg=directory)
            return False
        return True
    if os.path.isdir(directory):
        return True
    print(directory, 'is not a directory!')
    return False
def get(self, url):
    """Fetch *url* with a 10-second timeout and return the decoded body.

    Returns None on non-200 status, an "Access Denied" body, or any
    error (which is logged).  The charset is taken from the
    Content-Type header, then from self.pat matched against the body,
    then falls back to utf-8; decoding uses errors='ignore'.
    """
    try:
        http = httplib2.Http(timeout=10)
        response, content = http.request(url, 'GET')
        if response['status'] != '200':
            return
        if content == b'Access Denied':
            return
        encode = ''
        for item in response['content-type'].lower().split(';'):
            if 'charset' in item:
                encode = item.split('=')[1]
        if not encode:
            # fall back to sniffing the <meta charset> out of the body
            re_sh = self.pat.search(content)
            if re_sh:
                encode = re_sh.group(3).decode()
        if not encode:
            encode = 'utf-8'
        return content.decode(encode, 'ignore')
    except UnicodeDecodeError:
        # BUGFIX: the original passed url/response as extra positional
        # logging args with no %s placeholders in the message, which made
        # the logging call itself fail to format.
        self.logger.debug('get(): %s %s', url, response)
        log.log_traceback(self.logger)
    except:
        log.log_traceback(self.logger)
def parse_listing(self, catstr, url, content):
    """ parse listing page to get each product -> db

    Reads the item total from the first page, then calls get_info for
    every page ('&gf=y&cp=<page>' query suffix), 15 items per page.
    """
    item_per_page = 15
    tree = lxml.html.fromstring(content)
    try:
        total_num = tree.xpath('//div[@id="rightcol"]//div[@id="top-padbar"]/div/strong[2]')[0].text
    except:
        log.log_traceback(self.logger_list, '!* Do not get how many items of this category: {0}'.format(url))
        return
    num = int(total_num)
    log.log_print('{0} items in {1}'.format(num, url), self.logger_list)
    page_num = (num - 1) // item_per_page + 1
    if page_num == 0:
        log.log_print('Listing page do not have any items! -- {0}'.format(url), self.logger_list, logging.ERROR)
    elif page_num == 1:
        self.get_info(url, catstr, num, page_num, item_per_page)
    else:
        for i in xrange(1, page_num):
            self.get_info('{0}&gf=y&cp={1}'.format(url, i), catstr, item_per_page, i, item_per_page)
        # BUGFIX: the original passed `num % item_per_page`, which is 0 when
        # num divides evenly by item_per_page, so the last full page was
        # reported as having zero items.  Compute the real remainder.
        last_page_count = num - item_per_page * (page_num - 1)
        self.get_info('{0}&gf=y&cp={1}'.format(url, page_num), catstr, last_page_count, page_num, item_per_page)
pstem = porter_stemming.PorterStemmer() # cache cache_stem = cache.cache() cache_search = cache.cache() # for search cycle while True: cache_stem.pop_onethird() cache_search.pop_onethird() try: words = input('Please input words to search: ') except EOFError: print('Ctrl-d, program exit.') exit(0) except: log.log_traceback(logger) exit(0) logger.info('search: %s', words) if cache_stem.have_key(str(words)): query = cache_stem.get_by_key(str(words)) else: query = pstem.controling(words) # a words list cache_stem.add_kv(str(words), query) if cache_search.have_key(str(query)): # unhashable type: list result = cache_search.get_by_key(str(query)) else: result = search.multi_search(index, ranks, query) cache_search.add_kv(str(query), result) if not result: print('Sorry, the engine can not find what you want.')
def parse_product(self, url, content, itemID):
    """Parse a Cabela's product page and upsert the Product document.

    Collects: you-may-also-like links, og:image, availability (with an
    AJAX fallback), price, item number, shipping flag, description,
    Bazaarvoice rating/review counts and the model column.
    """
    tree = lxml.html.fromstring(content)
    try:
        node = tree.xpath('//div[@id="siteContent"]//div[@id="productDetailsTemplate"]/div[@class="layoutWithRightColumn"]')[0]
    except:
        log.log_traceback(self.logger_product, 'Parsing page problem: {0}'.format(url))
        return
    timenow = datetime.utcnow()
    also_like = []
    like = node.xpath('./div[@class="layoutRightColumn"]/div[@class="youMayAlsoLike"]//div[@class="item"]//a[@class="itemName"]')
    for l in like:
        link = l.get('href') if l.get('href').startswith('http') else 'http://www.cabelas.com' + l.get('href')
        also_like.append( (l.text_content(), link) )
    # img = node.xpath('./div[@class="layoutCenterColumn"]/div[@class="js-itemImageViewer itemImageInclude"]/img/@src')
    img = tree.xpath('/html/head/meta[@property="og:image"]/@content')
    if not img:
        log.log_traceback(self.logger_product, 'Page donot have a image: {0}'.format(url))
    info = node.xpath('./div[@class="layoutCenterColumn"]/div[@id="productInfo"]')
    if not info:
        log.log_traceback(self.logger_product, 'Page donot have a info: {0}'.format(url))
        return
    else:
        info = info[0]
    available = info.xpath('.//div[@class="variantConfigurator"]//div[@class="stockMessage"]/span/text()')
    if not available:
        if info.xpath('.//div[@class="variantConfigurator"]//div[@class="js-availabilityMessage"]'):
            # Availability is loaded by JS; extract the variant id and POST
            # to the availability include endpoint instead.
            m = re.compile(r"ddWidgetEntries\['js-vc13280170'] =(.*), values ").search(content)
            # http://www.cabelas.com/product/746407.uts
            if m:
                jsid = m.group(1).split(':')[-1].strip()
                post_data = { 'productVariantId': jsid, }
                jsurl = 'http://www.cabelas.com/catalog/includes/availabilityMessage_include.jsp'
                sess = requests.Session()
                resp_cont = sess.post(jsurl, data=post_data).content
                # NOTE(review): here `available` becomes a plain string while
                # the xpath branch yields a list; `available[0]` at the bottom
                # then stores only its first character — confirm intent.
                available = re.compile(r'<span class="availabilityMessage">(.*)</span>').search(resp_cont).group(1)
    price = info.xpath('.//div[@class="price"]/dl[@class="salePrice"]/dd[1]/text()')
    if not price:
        price = info.xpath('.//div[@class="price"]/dl[1]/dd[1]/text()')
    if not price:
        avail = info.xpath('.//div[@class="variantConfigurator"]/span[@class="soldOut"]/text()')
        if avail == ['Sold Out']:
            available = 'Sold Out'
        log.log_print('Page donot have a price: {0}'.format(url), self.logger_product, logging.WARNING)
    itemNO = info.xpath('.//div[@class="variantConfigurator"]//span[@class="itemNumber"]/text()') # this xpath need strip()
    if not itemNO:
        itemNO = tree.xpath('//div[@id="siteContent"]//div[@class="w100"]/meta[1]/@content')
        if not itemNO:
            log.log_traceback(self.logger_product, 'Page donot have a itemNO: {0}'.format(url))
    else:
        itemNO = itemNO[0].strip()
    # NOTE(review): on the meta-tag fallback path itemNO stays a list, yet
    # itemNO.split('-') below assumes a string — verify that fallback pages
    # never reach the rating branch.
    ship = info.xpath('.//div[@class="bottomNote"]//td/img/@alt')
    if ship and ship[0] == 'In-Store Pick Up':
        shipping = 'free shipping'
    else:
        shipping = ''
    desc = node.xpath('./div[@class="layoutCenterColumn"]/div[@id="tabsCollection"]//div[@id="description"]')
    rating, review = '', ''
    if node.xpath('./div[@class="layoutCenterColumn"]/div[@id="tabsCollection"]//div[@class="panel"]//div[@id="RRQASummaryBlock"]/div[@id="BVRRSummaryContainer"]'):
        # Ratings come from the Bazaarvoice embedded-html JS payload.
        jsurl = 'http://reviews.cabelas.com/8815/{0}/reviews.djs?format=embeddedhtml'.format(itemNO.split('-')[-1])
        rating_content = self.fetch_page(jsurl)
        m = re.compile(r'<span class=\\"BVRRNumber BVRRRatingNumber\\">(.*?)<\\/span>').search(rating_content)
        if m:
            rating = float(m.group(1))
        m = re.compile(r'<span class=\\"BVRRNumber BVRRBuyAgainTotal\\">(.*?)<\\/span>').search(rating_content)
        if m:
            review = float(m.group(1).replace(',', ''))
    model = []
    models = node.xpath('./div[@class="layoutCenterColumn"]/div[@id="productChart"]//tbody/tr/td[1]/text()')
    for m in models:
        model.append(m)
    product = Product.objects(itemID=itemID).first()
    if not product:
        product = Product(itemID=itemID)
    product.full_update_time = timenow
    product.also_like = also_like
    product.image = img[0] if img else ''
    if price:
        product.price = price[0].replace('$', '').replace(',', '')
    product.itemNO = itemNO
    product.shipping = shipping
    if available:
        product.available = available[0]
    product.description = desc[0].text_content() if desc else ''
    if rating:
        product.rating = rating
    if review:
        product.review = review
    if model:
        product.model = model
    product.updated = True
    product.save()
def __init__(self, file_path):
    """Read an INI-style config file into self.config.

    file_path -- path handed straight to ConfigParser.read (missing
    files are silently ignored by configparser itself).
    """
    try:
        self.config = configparser.ConfigParser()
        self.config.read(file_path)
    except Exception:
        # narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
        # propagate; on a parse error self.config may be left unset —
        # the failure is logged for the caller to notice
        log.log_traceback()
def get_info(self, url, catstr, item_num, page_num):
    """Fetch one B&H listing page and update each product's listing data.

    url -- listing page URL.
    catstr -- category path string stored with each product.
    item_num -- expected number of product blocks on this page.
    page_num -- 1-based page index, used for the best-sell rank.

    Each per-field extraction is wrapped in its own try/except that
    appends a default, so all the parallel lists stay the same length
    for the final conn.update_listing loop.
    """
    content = self.fetch_page(url)
    tree = lxml.html.fromstring(content)
    try:
        iter_ret = tree.xpath('//div[@class="tMain"]//div[starts-with(@class, "productBlock clearfix ")]')
    except:
        log.log_traceback(self.logger_list, 'Error xpath or page: {0}'.format(url))
        return
    timenow = datetime.utcnow()
    time_diff = timedelta(1)
    bahs = []
    best_sell_ranks = []
    images = []
    urls = []
    reviews = []
    brands = []
    titles = []
    highlights = []
    available = []
    models = []
    prices = []
    shippings = []
    if len(iter_ret) != item_num:
        log.log_traceback(self.logger_list, '{0} item_num: {1}, actual_num: {2}, page_num: {3}'.format(url, item_num, len(iter_ret), page_num) )
        if len(iter_ret) == 0:
            # empty page — probably throttled; back off and bail out
            time.sleep(30)
            return
    for j in xrange(len(iter_ret)):
        try:
            # "B&H #" is the per-product key; without it we skip the block
            bah = iter_ret[j].xpath('.//div[@class="productBlockCenter"]/div[@class="points"]//li[1]/span[@class="value"]/text()')[0]
        except:
            log.log_traceback(self.logger_list, '!* {0} of {1} did not get b&h number {2}'.format(j+1, item_num, url))
            log.log_print('content: {0}'.format(content), self.logger_list, logging.DEBUG)
            continue
        # product exist and without update less than 1 day, continue without update
        product = self.conn.get_product(bah)
        if product:
            if time_diff > (timenow - product['update_time']):
                continue
        bahs.append(bah)
        # This rank changes all the time.If some product updated,some not, same rank on two products will happen!
        best_sell_ranks.append( ITEM_PER_PAGE * (page_num-1) + j + 1 )
        # --- left column: image, link, review count ---
        try:
            node = iter_ret[j].xpath('.//div[@class="productBlockLeft"]')[0]
            try:
                image = node.xpath('./a/img/@src')[0]
                if image:
                    images.append('http://www.bhphotovideo.com' + image)
                else:
                    images.append('')
            except:
                images.append('')
                log.log_traceback(self.logger_list)
            try:
                link = node.xpath('./a/@href')[0]
                if link:
                    urls.append(link)
                else:
                    urls.append('')
            except:
                urls.append('')
                log.log_traceback(self.logger_list)
            try:
                review = node.xpath('./div[@class="ratingBox"]/a[@class="info"]/text()')
                if review:
                    m =re.compile('\d+').search(review[0])
                    reviews.append(m.group())
                else:
                    reviews.append('')
            except:
                reviews.append('')
                log.log_traceback(self.logger_list)
        except:
            # NOTE(review): if this outer lookup fails, the three inner lists
            # get no entry for this product and fall out of step with bahs.
            log.log_traceback(self.logger_list)
        # --- center column: brand, title, highlights, availability ---
        try:
            node = iter_ret[j].xpath('.//div[@class="productBlockCenter"]')[0]
            try:
                brand = node.xpath('./div[@class="clearfix"]/div[@class="brandTop"]/text()')[0]
                if brand:
                    brands.append(brand)
                else:
                    brands.append('')
            except:
                brands.append('')
                log.log_traceback(self.logger_list)
            try:
                title = node.xpath('./div[@id="productTitle"]//a/text()')[0]
                if title:
                    titles.append(title)
                else:
                    titles.append('')
            except:
                titles.append('')
                log.log_traceback(self.logger_list)
            try:
                desc = node.xpath('./ul/li')
                desc = [d.text_content() for d in desc]
                if desc:
                    highlights.append(desc)
                else:
                    highlights.append([])
            except:
                highlights.append([])
                log.log_traceback(self.logger_list)
            try:
                avail = node.xpath('.//div[@class="availability"]//text()')
                avail = [a.strip() for a in avail if not a.isspace()]
                if avail:
                    available.append(avail)
                else:
                    available.append([])
            except:
                available.append([])
                log.log_traceback(self.logger_list)
        except:
            log.log_traceback(self.logger_list)
        try:
            model = iter_ret[j].xpath('.//div[@class="productBlockCenter"]/div[@class="points"]//li[2]/span[@class="value"]/text()')
            if model:
                models.append(model[0])
            else:
                models.append('')
        except:
            models.append('')
            log.log_traceback(self.logger_list)
        # --- right column: price (3 fallbacks) and shipping ---
        try:
            price = iter_ret[j].xpath('.//div[@id="productRight"]/ul[starts-with(@class, "priceList ")]/li[@class]/span[@class="value"]/text()')
            if price:
                price = price[0].replace(',', '').replace('$', '')
            else:
                price = iter_ret[j].xpath('.//div[@id="productRight"]/ul[@class="priceList "]/li[@class="map youPay"]/span[@class="value"]/text()')
                if price:
                    price = price[0].strip().replace(',', '').replace('$', '')
                else:
                    # hidden price: simulate the add-to-cart DWR call and
                    # scrape the price out of the response
                    data_href = iter_ret[j].xpath('.//div[@id="productRight"]/ul[@class="priceList priceContainer"]/li[contains(@class, "cartLinkPrice")]/@data-href')
                    if data_href:
                        param0, param1 = ['string:{0}'.format(i) for i in data_href[0].split('_')]
                        page = '/' + '/'.join(url.split('/')[3:])
                        cinum = url.split('/')[7]
                        param3 = 'string:cat@__{0}@__type@__PrdLst'.format(cinum)
                        param4 = 'string:' + cinum
                        # NOTE(review): the session/script ids below look like
                        # values captured from one browser session — confirm
                        # the endpoint still accepts them.
                        post_data = { 'c0-methodName': 'addToCart', 'c0-scriptName': 'DWRHelper', 'c0-id': '0', 'batchId': '10', 'callCount': '1', 'windowName': 'bhmain', 'page': page, 'httpSessionId': 'wwh9QYSPBd!-1310320805', 'scriptSessionId': '60F4DF55163FC3A41DF6C7B70D572C73', 'c0-param0': param0, 'c0-param1': param1, 'c0-param2': 'string:1', 'c0-param3': param3, 'c0-param4': param4 }
                        jsurl = 'http://www.bhphotovideo.com/bnh/dwr/call/plaincall/DWRHelper.addToCart.dwr'
                        sess = requests.session()
                        resp_cont = sess.post(jsurl, data=post_data).content
                        m = re.compile(r'<span class=\\"atcLayerPricePrice\\">(.*?)</span>').search(resp_cont)
                        price = m.group(1).replace('\\n', '').replace(' ', '').replace(',', '').replace('$', '')
            if price:
                prices.append(price)
            else:
                prices.append('')
        except:
            prices.append('')
            log.log_traceback(self.logger_list)
        try:
            shipping = iter_ret[j].xpath('.//div[@id="productRight"]/ul[contains(@class, "priceList ")]/li[last()]/a/text()')
            if shipping:
                shippings.append(shipping[0])
            else:
                shippings.append('')
        except:
            shippings.append('')
            log.log_traceback(self.logger_list)
    # log the parallel-list lengths so misalignment is visible
    log.log_print('{0} {1} {2} {3} {4} {5} {6} {7} {8} {9} {10} {11} {12}'.format(len(bahs),len(best_sell_ranks),len(images),len(urls),len(reviews),len(brands),len(titles),len(highlights),len(available),len(models),len(prices),len(shippings),url), self.logger_list)
    update_now = datetime.utcnow()
    try:
        for i in xrange(len(bahs)):
            self.conn.update_listing(bahs[i], images[i], urls[i], reviews[i], brands[i], titles[i], highlights[i], available[i], models[i], prices[i], shippings[i], best_sell_ranks[i], catstr, update_now, detail_parse=False)
    except:
        log.log_traceback(self.logger_list, '{0} item of {1} items'.format(i, item_num))
def get_info(self, url, catstr, item_num, page_num, item_per_page):
    """Fetch one Best Buy listing page and update each product's data.

    url -- listing page URL.
    catstr -- category path string stored with each product.
    item_num -- expected number of products on this page.
    page_num, item_per_page -- used to compute the best-sell rank.

    Each field has its own try/except appending a default so the
    parallel lists line up for the final conn.update_listing loop.
    """
    content = self.fetch_page(url)
    tree = lxml.html.fromstring(content)
    try:
        iter_ret = tree.xpath('//div[@id="rightcol"]//div[@id="listView"]')[0]
        # [<Element div at 0x1d8a7d0>] <Element div at 0x1d8a7d0>
        sku = [n.strip('\n') for n in iter_ret.xpath('.//div[@class="info-main"]/div[@class="attributes"]//strong[@class="sku"]/text()')]
    except:
        # diagnose whether the page moved under a #container wrapper
        iter_ret = tree.xpath('//div[@id="container"]//div[@id="rightcol"]//div[@id="listView"]')
        if iter_ret:
            log.log_print('We need add container to xpath', self.logger_list, logging.ERROR)
        log.log_traceback(self.logger_list, 'Error when parse page: {0}'.format(url))
        return
    # timenow/time_diff feed the commented-out freshness check below
    timenow = datetime.utcnow()
    time_diff = timedelta(1)
    images = []
    prices = []
    titles = []
    urls = []
    manufacturers = []
    models = []
    description = []
    rating = []
    review = []
    available = []
    marketplace = []
    for j in range(1, item_num + 1):
        # product exist and without update less than 1 day, continue without update
        # but sku's length is alway full(e.g. 15)
        # product = self.conn.get_product(sku[j-1])
        # if product:
        #     if time_diff > (timenow - product['update_time']):
        #         continue
        try:
            # xpath positions are 1-based, hence the [{0}] with j from 1
            node = iter_ret.xpath('.//div[@class="hproduct"][{0}]'.format(j))[0]
        except:
            log.log_traceback(self.logger_list, 'Product number[{0}] did not get'.format(item_num))
            continue
        try:
            image = node.xpath('.//div[@class="image-col"]/a/img/@src')
            if image:
                images.append(image[0])
            else:
                images.append('')
        except:
            images.append('')
            log.log_traceback(self.logger_list)
        try:
            price = node.xpath('.//div[@class="info-side"]//span[@itemprop="price"]/text()')
            if price:
                prices.append(price[0])
            else:
                # price hidden behind a "view price" link; rebuild the
                # olspage URL from the skuId/productId in the href
                info_id = node.xpath('.//div[@class="info-side"]//a[contains(@href, "viewPrice")]/@href')
                if info_id:
                    # javascript:bbyCartController.viewPrice('{skuId:2658068,productId:1218343212620}')
                    info_id = info_id[0].split('{')[-1].split('}')[0].split(',')
                    # ['skuId:2658068', 'productId:1218343212620']
                    info = [lid.split(':') for lid in info_id]
                    price_url = 'http://www.bestbuy.com/site/olspage.jsp?{0}={1}&{2}={3}&id=pcat18005&type=page&renderMapCart=true'.format(info[0][0], info[0][1], info[1][0], info[1][1])
                    price_page = self.fetch_page(price_url)
                    price_page_tree = lxml.html.fromstring(price_page)
                    price_hide = price_page_tree.xpath('//div[@class="bby-price css-price bdt-price"]//span[@itemprop="price"]/text()')
                    prices.append(price_hide[0])
                else:
                    prices.append('')
        except:
            prices.append('')
            log.log_traceback(self.logger_list)
        try:
            title = node.xpath('.//div[@class="info-main"]/h3[@itemprop="name"]/a')
            # //text() '\nEnergizer - Disney ', '<b>Cars</b>', ' LED Handheld Flashlight - Red/Black'
            if title:
                titles.append(title[0].text_content().lstrip('\n'))
                urls.append(title[0].get('href'))
            else:
                titles.append('')
                urls.append('')
        except:
            titles.append('')
            urls.append('')
            log.log_traceback(self.logger_list)
        try:
            manufacturer = node.xpath('.//div[@class="info-main"]/span[@itemprop="manufacturer"]/span/@content')
            if manufacturer:
                manufacturers.append(manufacturer[0])
            else:
                manufacturers.append('')
        except:
            manufacturers.append('')
            log.log_traceback(self.logger_list)
        try:
            model = node.xpath('.//div[@class="info-main"]/div[@class="attributes"]//strong[@itemprop="model"]/text()')
            if model:
                models.append(model[0])
            else:
                models.append('')
        except:
            models.append('')
            log.log_traceback(self.logger_list)
        try:
            desc = node.xpath('.//div[@class="info-main"]/div[@class="description"]')
            if desc:
                description.append(desc[0].text_content())
            else:
                description.append('')
        except:
            description.append('')
            log.log_traceback(self.logger_list)
        try:
            rate = node.xpath('.//div[@class="info-main"]/div[@class="rating"]')
            if rate:
                # [u'\nCustomer Reviews:\n\xa0\nBe\nthe first to write a review.\n']
                rate_ = rate[0].text_content().split('\n', 2)[-1].strip()
                r_ = rate_.split('\n')
                if r_[0] == u'Be':
                    # u'Be\nthe first to write a review.' -> no reviews yet
                    rating.append('')
                    review.append('')
                else:
                    # '3 of 5\n\n(2 reviews)'
                    rating.append(r_[0])
                    review.append(r_[-1].lstrip('(').rstrip(')'))
            else:
                # combination, monitor and a host
                rating.append(None)
                review.append(None)
        except:
            rating.append('')
            review.append('')
            log.log_traceback(self.logger_list)
        try:
            avail = node.xpath('.//div[@class="info-main"]/div[@class="availHolder"]//div[@class="tooltip-contents"]/p/text()')
            # [' Usually leaves our warehouse in 1 business day ', '\n\n', u'\xa0\n\n\n\n\n\n\n', '\n'], [' Not available', '\n\n\n', u' Not Available\xa0\n', 'Find it at a Best Buy store.\n', '\n'], ['\n', ': Seller usually ships within 1-2 business days'], ['\n', ' Usually leaves our warehouse in 1 business day'], ['\n', ' You will schedule your delivery date in the next step.\n\n\n\n\n\n', '\n', '\n\n\n', u' Not Available\xa0\n'], ['\n', ' You will schedule your delivery date in the next step.\n\n\n\n\n\n', '\n', '\n\n\n', u' Not Available\xa0\n\n', 'Find it at a Best Buy store.', '\n'], [' ', 'Not Available for Shipping ', '\n\n\n', u'\xa0\n\n\n\n\n\n\n', '\n']]
            if not avail:
                avail = node.xpath('.//div[@class="info-main"]/div[@class="availHolder"]/a/span/text()')
            if not avail:
                available.append('')
            else:
                if avail[0] == '\n' or avail[0] == ' ':
                    available.append(avail[1].split(':')[-1].strip())
                else:
                    available.append(avail[0].strip())
        except:
            available.append('')
            log.log_traceback(self.logger_list)
        try:
            mrkpl = node.xpath('.//div[@class="info-main"]/div[@class="mrkpl"]//dd[@class="seller_info "]/a/text()')
            # [], ['\nBuy.com\n']
            if mrkpl:
                marketplace.append(mrkpl[0].strip('\n'))
            else:
                marketplace.append('')
        except:
            marketplace.append('')
            log.log_traceback(self.logger_list)
    # NOTE(review): `marketplace` is collected but never passed to
    # update_listing below — confirm whether that is intentional.
    log.log_print('{0} {1} {2} {3} {4} {5} {6} {7} {8} {9} {10}'.format(len(images),len(prices),len(titles),len(urls),len(manufacturers),len(models),len(sku),len(description),len(rating),len(review),url), self.logger_list)
    update_now = datetime.utcnow()
    for i in xrange(item_num):
        try:
            best_sell = item_per_page * (page_num-1) + i + 1
            self.conn.update_listing(sku[i], images[i], prices[i], titles[i], urls[i], manufacturers[i], models[i], description[i], rating[i], review[i], best_sell, catstr, update_now, detail_parse=False)
        except:
            log.log_traceback(self.logger_list, '{0} item of {1} items'.format(i, item_num))