Example #1
    def parse_category(self, category_list_path, link, content):
        tree = lxml.html.fromstring(content)

        items = tree.xpath('//div[@class="narrowcontent"]//a[@class="headlink"]')
        if not items:
            try:
                total_num = tree.xpath('//div[@id="rightcol"]//div[@id="top-padbar"]/div/strong[2]')[0].text
                self.conn.set_leaf_category(category_list_path, int(total_num))
            except Exception:
                log.log_traceback(self.logger_category, '!* Could not get the item count for this category: {0}'.format(link))

        for item in items:
            category_link = item.get('href')
            url = self.siteurl + category_link if category_link else ''

            category = item.text.strip('\n')
            # trap "Household Insulation":
            # http://www.amazon.com/s/ref=sr_ex_n_1?rh=n%3A228013%2Cn%3A!468240%2Cn%3A551240%2Cn%3A495346&bbn=495346&ie=UTF8&qid=1344909516
            if category in category_list_path:
                continue

            cats = category_list_path + [category]
            log.log_print('category ==> {0}'.format(cats), self.logger_category)
            if self.conn.category_updated(' > '.join(cats)):
                continue

            if url:
                log.log_print('url ==> {0}'.format(url), self.logger_category)
            else:
                log.log_print('url ==> {0}'.format(url), self.logger_category, logging.ERROR)
            log.log_print('queue size ==> {0}'.format(self.queue.qsize()), self.logger_category)
            self.conn.insert_update_category(cats, url)
            self.queue.put((cats, url))
Example #2
    def parse_category(self, category_list_path, url, content):
        tree = lxml.html.fromstring(content)
        node = tree.xpath('//div[@id="siteContent"]//div[@class="layoutLeftColumn"]//div[@class="leftnav_content"]')
        if node:
            node = node[0]
        else:
            log.log_traceback(self.logger_category, 'URL could not be parsed: {0}'.format(url))
            return

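        # descend the "active" branch until the deepest active category node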
        while node.xpath('./ul/li[@class="active"]'):
            node = node.xpath('./ul/li[@class="active"]')[0]
        items = node.xpath('./ul/li')
        if not items:
            showing = tree.xpath('//div[@id="siteContent"]//div[@class="layoutCenterColumn"]/div[@class="pagination"]//td[@class="showing"]/text()')[0]
            num = showing.split('of')[-1].split('total')[0].strip()
            cate = self.get_orm_category(url)
            cate.num = int(num)
            cate.is_leaf = True
            cate.save()
        else:
            for item in items:
                l = item.xpath('./a/@href')[0]
                link = l if l.startswith('http') else 'http://www.cabelas.com' + l
                category = item.xpath('./a/text()')[0]

                cate = self.get_orm_category(link)
                cate.cats = category_list_path + [category]
                cate.save()

                log.log_print('category ==> {0}'.format(cate.cats), self.logger_category)
                log.log_print('queue size ==> {0}'.format(self.queue.qsize()), self.logger_category)
                self.queue.put((cate.cats, link))
Example #3
    def parse_listing(self, catstr, url, content):
        """ ITEM_PER_PAGE: 25 in setting.py """
        tree = lxml.html.fromstring(content)
        try:
            num = tree.xpath('//div[@class="tMain"]//div[@id="Plistbar"]/div[@id="PfirstRow"]/span[@id="PitemNum"]/text()')[0]
            total_num = int(num.replace('\t', '').strip().split('\n')[-1].split()[0])
        except Exception:
            log.log_traceback(self.logger_list, '!* Could not get the item count for this category [{0}]: {1}'.format(catstr, url))
#            print content
            time.sleep(30)
            return

        log.log_print('{0} items in {1}'.format(total_num, url), self.logger_list)

        page_num = (total_num - 1) // ITEM_PER_PAGE + 1
        if page_num == 0:
            log.log_print('Listing page does not have any items! -- {0}'.format(url), self.logger_list, logging.ERROR)
        elif page_num == 1:
            self.get_info(url, catstr, total_num, page_num)
        else:
            part1 = '/'.join(url.split('/')[:-2])
            part2 = '/'.join(url.split('/')[-2:])
            for i in xrange(1, page_num):
                self.get_info(part1 + '/pn/{0}/'.format(i) + part2, catstr, ITEM_PER_PAGE, i)
                time.sleep(0.5)
            # last page: subtract the full pages rather than using "%", which
            # yields 0 when total_num is an exact multiple of ITEM_PER_PAGE
            self.get_info(part1 + '/pn/{0}/'.format(page_num) + part2, catstr, total_num - ITEM_PER_PAGE * (page_num - 1), page_num)
        time.sleep(0.5)
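A standalone sketch of the page arithmetic above, assuming ITEM_PER_PAGE = 25 as the docstring states (the function names are illustrative, not from the source):

    ITEM_PER_PAGE = 25

    def page_count(total_num):
        # ceiling division: 1..25 items -> 1 page, 26..50 -> 2 pages, 0 -> 0 pages
        return (total_num - 1) // ITEM_PER_PAGE + 1

    def items_on_last_page(total_num):
        # subtracting the full pages avoids the "% ITEM_PER_PAGE == 0" edge
        # case that an exact multiple (e.g. 50 items) would otherwise hit
        return total_num - ITEM_PER_PAGE * (page_count(total_num) - 1)

    assert page_count(50) == 2 and items_on_last_page(50) == 25
    assert page_count(51) == 3 and items_on_last_page(51) == 1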
Example #4
    def parse_product(self, sku, url, content):
        tree = lxml.html.fromstring(content.decode('utf-8','replace'))
        try:
            node = tree.xpath('//div[@id="content"]/div[@id="pdpcenterwell"]')[0]
        except Exception:
            log.log_traceback(self.logger_product, 'Problem parsing product page: {0}'.format(url))
            return

        #item = node.xpath('.//div[@id="productsummary"]/div[@id="financing"]//li/a/text()') # will add a \n to the tail of every field
        offer = node.xpath('.//div[@id="productsummary"]/div[@id="financing"]//li/a')
        # ['\n18-Month Financing', '\nGet 4% Back in Rewards: See How']
        offers = [a.text_content().strip() for a in offer]

        specifications = []
        try:
            specifications = node.xpath('.//div[@id="productdetail"]/div[@id="pdptabs"]/div[@id="tabbed-specifications"]//li/div//text()')
#            if specification:
#
#                spec = []
#                for a in specification:
#                    if 'Customer Reviews' in a:
#                        break
#                    if a != '\n':
#                        spec.append( a.strip('\n') )
#
#                length = len(spec)
#                i = 0 
#                key = ''
#                while i < length:
#                    if i + 2 < length and spec[i+2] == ' ':
#                        # mulit value pair: ['software include', 'vim', ' ', 'emacs', ' ', 'process', 'intel']
#                        if key:
#                            specifications[key].append(spec[i+1])
#                        else:
#                            specifications[ spec[i] ] = [ spec[i+1] ]
#                            key = spec[i]
#                        # print key, specifications[key]
#                    else:
#                        # normal pair: ['cpu', 'amd', 'brand', 'dell']
#                        key = ''
#                        if spec[i] is ' ' or spec[i] is '':
#                            # ['Estimated Yearly Operating Cost', '$17', '', 'UPC', '600603146435', ' ', '']
#                            i += 1
#                            if i >= length: # last item is ' '
#                                break
#                            continue
#                        else:
#                            specifications[ spec[i] ] = spec[i+1]
#                            # print spec[i], specifications[ spec[i] ]
#                    i += 2
        except Exception:
            log.log_traceback(self.logger_product, 'Problem parsing product specifications: {0}'.format(url))

        self.conn.update_product(sku, offers, specifications)
Example #5
 def cycle_crawl_category(self, timeover=60):
     while not self.queue.empty():
         try:
             job = self.queue.get(timeout=timeover)
             utf8_content = self.fetch_page(job[1])
             self.parse_category(job[0], job[1], utf8_content)
         except Queue.Empty:
             log.log_traceback(self.logger_category, 'Queue.get waited {0} seconds without a job!'.format(timeover))
         except Exception:
             log.log_traceback(self.logger_category)
Example #6
    def parse_product(self, bah, url, content):
        tree = lxml.html.fromstring(content)
        try:
            node = tree.xpath('//div[@class="tMain"]//div[@id="productAllWrapper"]/div[@id="productMainWrapper"]')
            if node:
                node = node[0]
            else:
                return
        except Exception:
            log.log_traceback(self.logger_product, 'Problem parsing product page: {0}'.format(url))
            return

        bill_later = node.xpath('.//div[@id="productRight"]//div[contains(@class, "altPayment findLast")]//li/a//text()')
        bill_later = [a.strip() for a in bill_later]

        info = []
        buy_together = tree.xpath('.//div[@class="productInfoArea adm findLast"]/a/@href')
        if buy_together:
            content = self.fetch_page(buy_together[0])
            intree = lxml.html.fromstring(content)
            ones = intree.xpath('//div[@class="ui-dialog-content"]//div[@class="col titleDetails"]')
            for one in ones:
                title = one.xpath('./div[@class="title"]/span//text()')
                entry = [t.strip() for t in title if t.strip()]
                model = one.xpath('./div[@class="details"]/p/text()')
                if model:
                    entry.append(model[0])
                info.extend(entry)  # accumulate all buy-together items
        buy_together = info

        specifications = {}
        tables = node.xpath('.//div[@id="bottomWrapper"]//div[@id="Specification"]//table[@class="specTable"]')
        for table in tables:
            keys = table.xpath('.//tr/td[@class="specTopic"]')
            values = table.xpath('.//tr/td[@class="specDetail"]')
            k = [key.text_content().strip() for key in keys]
            v = [value.text_content().strip().replace('\n', '') for value in values]
            specifications.update(dict(zip(k, v)))

        in_box = node.xpath('.//div[@id="bottomWrapper"]//div[@id="WhatsInTheBox"]/ul/li')
        in_box = [a.text_content().strip() for a in in_box]

        rating = node.xpath('.//div[@id="bottomWrapper"]//div[@id="costumerReview"]//div[@class="pr-snapshot-rating rating"]/span/text()')
        if rating:
            rating = rating[0]
        else:
            rating = ''

#        items = node.xpath('.//div[@id="bottomWrapper"]//div[@class="accGroup "]//form[@class="addToCartForm"]//div[@class="accDetails"]')
#        for item in items:
#            title = item.xpath('./div[1]')
#            title[0].text_content()
#            model = item.xpath('./div[@class="ItemNum"]/span')
#            model[0].text_content()



        self.conn.update_product(bah, bill_later, specifications, in_box, rating, buy_together)
        time.sleep(1)
Example #7
    def parse_listing(self, url, catstr, tree, page_num, num_in_this_url):
        """ ITEM_PER_PAGE: 48 in settings.py """
        try:
            nodes = tree.xpath('//div[@id="siteContent"]//div[@class="layoutCenterColumn"]/div[@class="itemsWrapper"]/div[@class="resultsColumn"]//div[@class="itemEntryInner"]')
        except:
            log.log_traceback(self.logger_list, 'Did not parse node: {0}'.format(url))
#            log.log_print('content: {0}'.format(content), self.logger_list, logging.DEBUG)
            return
        if len(nodes) != num_in_this_url:
            log.log_traceback(self.logger_list, '{0} num_in_this_url: {1}, actual_num: {2}'.format(url, num_in_this_url, len(nodes)) )

        timenow = datetime.utcnow()

        for j in xrange(len(nodes)):
            price = nodes[j].xpath('.//div[@class="price"]/div/div[@class="textSale"]/text()')
            if not price:
                price = nodes[j].xpath('.//div[@class="price"]/div/div/text()')
            if not price:
                price = ''
                log.log_traceback(self.logger_list, 'do not get price {0} {1}'.format(url, j))
            else:
                price = price[0]

            t = nodes[j].xpath('.//a[@class="itemName"]')[0]
            title = t.text_content()
            if not title:
                title = ''
                log.log_traceback(self.logger_list, 'do not get title {0} {1}'.format(url, j))
            l = t.get('href')
            if not l:
                link = ''
                log.log_traceback(self.logger_list, 'do not get link {0} {1}'.format(url, j))
            else:
                link = l if l.startswith('http') else 'http://www.cabelas.com' + l

            # This rank changes all the time. If some products update and some do not, two products can end up with the same rank!
            sell_rank = ITEM_PER_PAGE * (page_num-1) + j + 1

            m = re.compile(r'.*/(\d+)\.uts?.*').match(link)
            if not m:
                log.log_traceback(self.logger_list, 'do not get itemID {0} {1}'.format(url, j))
                continue
            itemID = m.group(1)
            product = Product.objects(itemID=itemID).first()
            if not product:
                product = Product(itemID=itemID)

            product.title = title
            product.sell_rank = sell_rank
            product.list_update_time = timenow
            product.price = price.replace('$', '').replace(',', '')
            product.updated = False
            if catstr not in product.catstrs:
                product.catstrs.append(catstr)
            product.save()
Example #8
def load_data():
    """
    Load all table data into the search index.
    """
    try:
        loader.download()
        load_table_data()
        status = 'loaded'
    except Exception as ex:
        log.log_traceback(ex)
        status = 'failed'
    return flask.jsonify({'status': status})
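For context, a hypothetical wiring of this endpoint (the route path and app object are assumptions, not shown in the source):

    import flask

    app = flask.Flask(__name__)

    # load_data already returns flask.jsonify(...), so it can be bound
    # directly as a view function.
    app.add_url_rule('/load-data', view_func=load_data, methods=['POST'])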
Example #9
    def parse_category(self, category_list_path, url, content):
        tree = lxml.html.fromstring(content)

        items = tree.xpath('//div[@class="tMain"]//div[@class="column"]//li/a')
        if not items:
            # second kind of category page
            items = tree.xpath('//div[@class="tMain"]//table[@class="catColumn"]//tr[@valign="top"]//a')

        if items:
            links = [item.get('href') for item in items]
            names = [item.text_content() for item in items]
        else:
            # third kind of category page
            items = tree.xpath('//div[@id="mainContent"]//div[@class="categoryGroup staticBody"]/div/a')
            if items:
                links = [item.get('href') for item in items]
                names = [item.xpath('.//img')[0].get('alt') for item in items]
            else:
                links, names = [], []

                try:
                    # listing page
                    num = tree.xpath('//div[@class="tMain"]//div[@id="Plistbar"]/div[@id="PfirstRow"]/span[@id="PitemNum"]/text()')[0]
                    total_num = int(num.replace('\t', '').strip().split('\n')[-1].split()[0])

                    self.conn.set_leaf_category(category_list_path, total_num)
                except Exception:
                    log.log_traceback(self.logger_category, '!! {0} is neither a category nor a listing page, or another error occurred.'.format(url))
                    log.log_print('content: {0}'.format(content), self.logger_category, logging.DEBUG)
                    return

        if len(links) != len(names):
            log.log_traceback(self.logger_category, '!! links num: {0}; names num: {1}. {2}'.format(len(links), len(names), url))
            return
        pairs = zip(names, links)

        for category, link in pairs:

            # trap "Household Insulation":
            # http://www.amazon.com/s/ref=sr_ex_n_1?rh=n%3A228013%2Cn%3A!468240%2Cn%3A551240%2Cn%3A495346&bbn=495346&ie=UTF8&qid=1344909516
            if category_list_path[-1] == category:
                continue

            cats = category_list_path + [category]
            log.log_print('category ==> {0}'.format(cats), self.logger_category)
            if self.conn.category_updated(' > '.join(cats)):
                continue

            log.log_print('queue size ==> {0}'.format(self.queue.qsize()), self.logger_category)
            self.conn.insert_update_category(cats, link)
            self.queue.put((cats, link))
Example #10
 def dump_file(self, variable, directory, file_name):
     ''' dump variable to file. '''
     full_name = os.path.join(directory, file_name)
     if os.path.isfile(full_name):
         os.rename(full_name, full_name + '_bak')
     with open(full_name, 'wb') as f:
         try:
             pickle.dump(variable, f)
         except:
             log.log_traceback(msg='dump to ' + full_name + ' error:')
             return False
     return True
Example #11
 def run(self):
     ''' the get-and-do loop'''
     while True:
         try:
             func, args, kwargs = self.work_queue.get(timeout=self.timeout)
             result = func(*args, **kwargs)
             if result: self.result_queue.put(result) # only add url_graph result, filter indexing result
             # print('worker[%d]: %s' % (self.id, str(result)))
             # formally need task_done(), but we can omit it here.
             self.work_queue.task_done()
         except queue.Empty:
             break
         except:
             message = 'worker[{0}]'.format(self.id)
             log.log_traceback(msg=message)
Example #12
 def load_file(self, directory, file_name):
     ''' load dump file to variables. '''
     full_name = os.path.join(directory, file_name)
     open_file = full_name + '_bak'
     if not os.path.isfile(open_file):
         if not os.path.isfile(full_name):
             print('No dump file exists in ', directory, file_name)
             return False
         else:
             open_file = full_name
     with open(open_file, 'rb') as f:
         try:
             variable = pickle.load(f)
         except:
             log.log_traceback(msg='load dump file from ' + open_file + ' error:')
             return False
     return variable
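A hypothetical round trip with the dump_file/load_file pair above (the directory and file name are illustrative, and `store` stands for whatever object carries these methods):

    # dump_file rotates any existing dump to "state.pkl_bak" before writing,
    # and load_file prefers that "_bak" copy when it exists, so it returns
    # the previous dump whenever one is present.
    ok = store.dump_file({'seen_urls': 42}, '/tmp/crawler', 'state.pkl')
    state = store.load_file('/tmp/crawler', 'state.pkl')
    if ok and state is not False:
        print(state)

Note that load_file returns False on failure, so a dumped value that is itself falsy (e.g. {} or 0) cannot be distinguished from an error.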
Example #13
def check_dir(directory):
    ''' Check whether directory exist.
        If so, make sure it is not a file.
        If not, create one.
    '''

    if os.path.exists(directory):
        if not os.path.isdir(directory):
            print(directory, 'is not a directory!')
            return False
    else:
        try:
            os.makedirs(directory)
        except OSError:
            log.log_traceback(msg=directory)
            return False
    return True
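If Python 3 is available, the exists/isdir/makedirs dance collapses into one call; a minimal sketch (keeping the same log helper):

    import os

    def check_dir3(directory):
        ''' Python 3 variant: exist_ok=True tolerates an existing directory,
            while an existing file at the path raises FileExistsError,
            which is a subclass of OSError. '''
        try:
            os.makedirs(directory, exist_ok=True)
        except OSError:
            log.log_traceback(msg=directory)
            return False
        return True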
Example #14
    def get(self, url):
        try:
            http = httplib2.Http(timeout=10)
            response, content = http.request(url, 'GET')
            if response['status'] != '200': return
            if content == b'Access Denied': return

            encode = ''
            for item in response['content-type'].lower().split(';'):
                if 'charset' in item:
                    encode = item.split('=')[1]
            if not encode:
                re_sh = self.pat.search(content)
                if re_sh:
                    encode = re_sh.group(3).decode()
            if not encode: encode = 'utf-8'
            return content.decode(encode, 'ignore')
        except UnicodeDecodeError:
            self.logger.debug('get(): %s %s', url, response)
            log.log_traceback(self.logger)
        except:
            log.log_traceback(self.logger)
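The charset lookup above is easy to test in isolation; a minimal sketch (the header value is illustrative):

    # Pull "charset=..." out of a Content-Type header the way get() does.
    content_type = 'text/html; charset=ISO-8859-1'
    encode = ''
    for item in content_type.lower().split(';'):
        if 'charset' in item:
            encode = item.split('=')[1].strip()
    assert encode == 'iso-8859-1'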
Example #15
    def parse_listing(self, catstr, url, content):
        """ parse listing page to get each product -> db """
        item_per_page = 15
        tree = lxml.html.fromstring(content)
        try:
            total_num = tree.xpath('//div[@id="rightcol"]//div[@id="top-padbar"]/div/strong[2]')[0].text
        except Exception:
            log.log_traceback(self.logger_list, '!* Could not get the item count for this category: {0}'.format(url))
            return

        num = int(total_num)
        log.log_print('{0} items in {1}'.format(num, url), self.logger_list)

        page_num = (num - 1) // item_per_page + 1
        if page_num == 0:
            log.log_print('Listing page does not have any items! -- {0}'.format(url), self.logger_list, logging.ERROR)
        elif page_num == 1:
            self.get_info(url, catstr, num, page_num, item_per_page)
        else:
            for i in xrange(1, page_num):
                self.get_info('{0}&gf=y&cp={1}'.format(url, i), catstr, item_per_page, i, item_per_page)
            # last page: subtract the full pages rather than using "%", which
            # yields 0 when num is an exact multiple of item_per_page
            self.get_info('{0}&gf=y&cp={1}'.format(url, page_num), catstr, num - item_per_page * (page_num - 1), page_num, item_per_page)
Example #16
    pstem = porter_stemming.PorterStemmer()
    # cache
    cache_stem = cache.cache()
    cache_search = cache.cache()

    # for search cycle
    while True:
        cache_stem.pop_onethird()
        cache_search.pop_onethird()
        try:
            words = input('Please input words to search: ')
        except EOFError:
            print('Ctrl-d pressed, program exits.')
            exit(0)
        except:
            log.log_traceback(logger)
            exit(0)
        logger.info('search: %s', words)
        if cache_stem.have_key(str(words)):
            query = cache_stem.get_by_key(str(words))
        else:
            query = pstem.controling(words) # a list of words
            cache_stem.add_kv(str(words), query)
        if cache_search.have_key(str(query)): # unhashable type: list
            result = cache_search.get_by_key(str(query))
        else:
            result = search.multi_search(index, ranks, query)
            cache_search.add_kv(str(query), result)

        if not result:
            print('Sorry, the engine cannot find what you want.')
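Both caches key on str(...) because a list is unhashable; a minimal sketch of that memoization pattern, with a plain dict standing in for cache.cache() (an assumption, since the cache API is not shown):

    cache_search = {}

    def multi_search_cached(index, ranks, query):
        key = str(query)  # query is a list, so stringify it for the dict key
        if key not in cache_search:
            cache_search[key] = search.multi_search(index, ranks, query)
        return cache_search[key]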
Example #17
    def parse_product(self, url, content, itemID):
        tree = lxml.html.fromstring(content)
        try:
            node = tree.xpath('//div[@id="siteContent"]//div[@id="productDetailsTemplate"]/div[@class="layoutWithRightColumn"]')[0]
        except Exception:
            log.log_traceback(self.logger_product, 'Problem parsing page: {0}'.format(url))
            return

        timenow = datetime.utcnow()

        also_like = []
        like = node.xpath('./div[@class="layoutRightColumn"]/div[@class="youMayAlsoLike"]//div[@class="item"]//a[@class="itemName"]')
        for l in like:
            link = l.get('href') if l.get('href').startswith('http') else 'http://www.cabelas.com' + l.get('href')
            also_like.append( (l.text_content(), link) )

#        img = node.xpath('./div[@class="layoutCenterColumn"]/div[@class="js-itemImageViewer itemImageInclude"]/img/@src')
        img = tree.xpath('/html/head/meta[@property="og:image"]/@content')
        if not img:
            log.log_traceback(self.logger_product, 'Page does not have an image: {0}'.format(url))

        info = node.xpath('./div[@class="layoutCenterColumn"]/div[@id="productInfo"]')
        if not info:
            log.log_traceback(self.logger_product, 'Page does not have an info block: {0}'.format(url))
            return
        else:
            info = info[0]

        available = info.xpath('.//div[@class="variantConfigurator"]//div[@class="stockMessage"]/span/text()')
        if not available:
            if info.xpath('.//div[@class="variantConfigurator"]//div[@class="js-availabilityMessage"]'):
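                # availability is injected client-side; replicate the page's POST to fetch it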
                m = re.compile(r"ddWidgetEntries\['js-vc13280170'] =(.*), values ").search(content)
                # http://www.cabelas.com/product/746407.uts
                if m:
                    jsid = m.group(1).split(':')[-1].strip()
                    post_data = {
                        'productVariantId': jsid,
                    }
                    jsurl = 'http://www.cabelas.com/catalog/includes/availabilityMessage_include.jsp'
                    sess = requests.Session()
                    resp_cont = sess.post(jsurl, data=post_data).content
                    m = re.compile(r'<span class="availabilityMessage">(.*)</span>').search(resp_cont)
                    if m:
                        available = m.group(1)

        price = info.xpath('.//div[@class="price"]/dl[@class="salePrice"]/dd[1]/text()')
        if not price:
            price = info.xpath('.//div[@class="price"]/dl[1]/dd[1]/text()')
        if not price:
            avail = info.xpath('.//div[@class="variantConfigurator"]/span[@class="soldOut"]/text()')
            if avail == ['Sold Out']:
                available = 'Sold Out'
                log.log_print('Page does not have a price: {0}'.format(url), self.logger_product, logging.WARNING)

        itemNO = info.xpath('.//div[@class="variantConfigurator"]//span[@class="itemNumber"]/text()') # this xpath needs strip()
        if not itemNO:
            itemNO = tree.xpath('//div[@id="siteContent"]//div[@class="w100"]/meta[1]/@content')
        if not itemNO:
            itemNO = ''
            log.log_traceback(self.logger_product, 'Page does not have an itemNO: {0}'.format(url))
        else:
            itemNO = itemNO[0].strip()

        ship = info.xpath('.//div[@class="bottomNote"]//td/img/@alt')
        if ship and ship[0] == 'In-Store Pick Up':
            shipping = 'free shipping'
        else:
            shipping = ''

        desc = node.xpath('./div[@class="layoutCenterColumn"]/div[@id="tabsCollection"]//div[@id="description"]')

        rating, review = '', ''
        if node.xpath('./div[@class="layoutCenterColumn"]/div[@id="tabsCollection"]//div[@class="panel"]//div[@id="RRQASummaryBlock"]/div[@id="BVRRSummaryContainer"]'):
            jsurl = 'http://reviews.cabelas.com/8815/{0}/reviews.djs?format=embeddedhtml'.format(itemNO.split('-')[-1])
            rating_content = self.fetch_page(jsurl)
            m = re.compile(r'<span class=\\"BVRRNumber BVRRRatingNumber\\">(.*?)<\\/span>').search(rating_content)
            if m:
                rating = float(m.group(1))
            m = re.compile(r'<span class=\\"BVRRNumber BVRRBuyAgainTotal\\">(.*?)<\\/span>').search(rating_content)
            if m:
                review = float(m.group(1).replace(',', ''))

        models = node.xpath('./div[@class="layoutCenterColumn"]/div[@id="productChart"]//tbody/tr/td[1]/text()')
        model = list(models)

        product = Product.objects(itemID=itemID).first()
        if not product:
            product = Product(itemID=itemID)

        product.full_update_time = timenow
        product.also_like = also_like
        product.image = img[0] if img else ''
        if price:
            product.price = price[0].replace('$', '').replace(',', '')
        product.itemNO = itemNO
        product.shipping = shipping
        if available:
            product.available = available[0] if isinstance(available, list) else available
        product.description = desc[0].text_content() if desc else ''
        if rating:
            product.rating = rating
        if review:
            product.review = review
        if model:
            product.model = model
        product.updated = True

        product.save()
Example #18
 def __init__(self, file_path):
     try:
         self.config = configparser.ConfigParser()
         self.config.read(file_path)
     except Exception:
         log.log_traceback()
Example #19
    def get_info(self, url, catstr, item_num, page_num):
        """  """
        content = self.fetch_page(url)
        tree = lxml.html.fromstring(content)
        try:
            iter_ret = tree.xpath('//div[@class="tMain"]//div[starts-with(@class, "productBlock clearfix ")]')
        except Exception:
            log.log_traceback(self.logger_list, 'Error in xpath or page: {0}'.format(url))
            return

        timenow = datetime.utcnow()
        time_diff = timedelta(1)

        bahs = []
        best_sell_ranks = []
        images = []
        urls = []
        reviews = []
        brands = []
        titles = []
        highlights = []
        available = []
        models = []
        prices = []
        shippings = []

        if len(iter_ret) != item_num:
            log.log_traceback(self.logger_list, '{0} item_num: {1}, actual_num: {2}, page_num: {3}'.format(url, item_num, len(iter_ret), page_num) )
            if len(iter_ret) == 0:
                time.sleep(30)
                return

        for j in xrange(len(iter_ret)):

            try:
                bah = iter_ret[j].xpath('.//div[@class="productBlockCenter"]/div[@class="points"]//li[1]/span[@class="value"]/text()')[0]
            except:
                log.log_traceback(self.logger_list, '!* {0} of {1} did not get b&h number {2}'.format(j+1, item_num, url))
                log.log_print('content: {0}'.format(content), self.logger_list, logging.DEBUG)
                continue

            # product exists and was updated less than 1 day ago: skip it without updating
            product = self.conn.get_product(bah)
            if product:
                if time_diff > (timenow - product['update_time']):
                    continue

            bahs.append(bah)
            # This rank changes all the time. If some products update and some do not, two products can end up with the same rank!
            best_sell_ranks.append( ITEM_PER_PAGE * (page_num-1) + j + 1 )

            try:
                node = iter_ret[j].xpath('.//div[@class="productBlockLeft"]')[0]
                try:
                    image = node.xpath('./a/img/@src')[0]
                    if image:
                        images.append('http://www.bhphotovideo.com' + image)
                    else:
                        images.append('')
                except:
                    images.append('')
                    log.log_traceback(self.logger_list)

                try:
                    link = node.xpath('./a/@href')[0]
                    if link:
                        urls.append(link)
                    else:
                        urls.append('')
                except:
                    urls.append('')
                    log.log_traceback(self.logger_list)

                try:
                    review = node.xpath('./div[@class="ratingBox"]/a[@class="info"]/text()')
                    if review:
                        m = re.compile(r'\d+').search(review[0])
                        reviews.append(m.group())
                    else:
                        reviews.append('')
                except:
                    reviews.append('')
                    log.log_traceback(self.logger_list)
            except:
                log.log_traceback(self.logger_list)


            try:
                node = iter_ret[j].xpath('.//div[@class="productBlockCenter"]')[0]
                try:
                    brand = node.xpath('./div[@class="clearfix"]/div[@class="brandTop"]/text()')[0]
                    if brand:
                        brands.append(brand)
                    else:
                        brands.append('')
                except:
                    brands.append('')
                    log.log_traceback(self.logger_list)

                try:
                    title = node.xpath('./div[@id="productTitle"]//a/text()')[0]
                    if title:
                        titles.append(title)
                    else:
                        titles.append('')
                except:
                    titles.append('')
                    log.log_traceback(self.logger_list)

                try:
                    desc = node.xpath('./ul/li')
                    desc = [d.text_content() for d in desc]
                    if desc:
                        highlights.append(desc)
                    else:
                        highlights.append([])
                except:
                    highlights.append([])
                    log.log_traceback(self.logger_list)

                try:
                    avail = node.xpath('.//div[@class="availability"]//text()')
                    avail = [a.strip() for a in avail if not a.isspace()]
                    if avail:
                        available.append(avail)
                    else:
                        available.append([])
                except:
                    available.append([])
                    log.log_traceback(self.logger_list)

            except:
                log.log_traceback(self.logger_list)

            try:
                model = iter_ret[j].xpath('.//div[@class="productBlockCenter"]/div[@class="points"]//li[2]/span[@class="value"]/text()')
                if model:
                    models.append(model[0])
                else:
                    models.append('')
            except:
                models.append('')
                log.log_traceback(self.logger_list)

            try:
                price = iter_ret[j].xpath('.//div[@id="productRight"]/ul[starts-with(@class, "priceList ")]/li[@class]/span[@class="value"]/text()')
                if price:
                    price = price[0].replace(',', '').replace('$', '') 
                else:
                    price = iter_ret[j].xpath('.//div[@id="productRight"]/ul[@class="priceList "]/li[@class="map youPay"]/span[@class="value"]/text()')
                    if price:
                        price = price[0].strip().replace(',', '').replace('$', '')
                    else:
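                        # price is hidden behind an add-to-cart widget; replicate the DWR call the page makes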
                        data_href = iter_ret[j].xpath('.//div[@id="productRight"]/ul[@class="priceList priceContainer"]/li[contains(@class, "cartLinkPrice")]/@data-href')
                        if data_href:
                            param0, param1 = ['string:{0}'.format(i) for i in data_href[0].split('_')]
                            page = '/' + '/'.join(url.split('/')[3:])
                            cinum = url.split('/')[7]
                            param3 = 'string:cat@__{0}@__type@__PrdLst'.format(cinum)
                            param4 = 'string:' + cinum
                            post_data = { 
                                'c0-methodName': 'addToCart',
                                'c0-scriptName': 'DWRHelper',
                                'c0-id': '0',
                                'batchId': '10',
                                'callCount': '1',
                                'windowName': 'bhmain',
                                'page': page,
                                'httpSessionId': 'wwh9QYSPBd!-1310320805',
                                'scriptSessionId': '60F4DF55163FC3A41DF6C7B70D572C73',
                                'c0-param0': param0,
                                'c0-param1': param1,
                                'c0-param2': 'string:1',
                                'c0-param3': param3,
                                'c0-param4': param4
                            }       
                            jsurl = 'http://www.bhphotovideo.com/bnh/dwr/call/plaincall/DWRHelper.addToCart.dwr'
                            sess = requests.session()
                            resp_cont = sess.post(jsurl, data=post_data).content
                            m = re.compile(r'<span class=\\"atcLayerPricePrice\\">(.*?)</span>').search(resp_cont)
                            price = m.group(1).replace('\\n', '').replace(' ', '').replace(',', '').replace('$', '')

                if price:
                    prices.append(price)
                else:   
                    prices.append('')
            except:
                prices.append('')
                log.log_traceback(self.logger_list)


            try:
                shipping = iter_ret[j].xpath('.//div[@id="productRight"]/ul[contains(@class, "priceList ")]/li[last()]/a/text()')
                if shipping:
                    shippings.append(shipping[0])
                else:
                    shippings.append('')
            except:
                shippings.append('')
                log.log_traceback(self.logger_list)


        log.log_print('{0} {1} {2} {3} {4} {5} {6} {7} {8} {9} {10} {11} {12}'.format(len(bahs),len(best_sell_ranks),len(images),len(urls),len(reviews),len(brands),len(titles),len(highlights),len(available),len(models),len(prices),len(shippings),url), self.logger_list)
        update_now = datetime.utcnow()
        try:
            for i in xrange(len(bahs)):
                self.conn.update_listing(bahs[i], images[i], urls[i], reviews[i], brands[i], titles[i], highlights[i], available[i], models[i], prices[i], shippings[i], best_sell_ranks[i], catstr, update_now, detail_parse=False)
        except Exception:
            log.log_traceback(self.logger_list, 'item {0} of {1} items'.format(i, item_num))
Example #20
    def get_info(self, url, catstr, item_num, page_num, item_per_page):
        """  """
        content = self.fetch_page(url)
        tree = lxml.html.fromstring(content)
        try:
            iter_ret = tree.xpath('//div[@id="rightcol"]//div[@id="listView"]')[0]
            # [<Element div at 0x1d8a7d0>] <Element div at 0x1d8a7d0>

            sku = [n.strip('\n') for n in iter_ret.xpath('.//div[@class="info-main"]/div[@class="attributes"]//strong[@class="sku"]/text()')]
        except Exception:
            iter_ret = tree.xpath('//div[@id="container"]//div[@id="rightcol"]//div[@id="listView"]')
            if iter_ret:
                log.log_print('We need to add "container" to the xpath', self.logger_list, logging.ERROR)
            log.log_traceback(self.logger_list, 'Error when parsing page: {0}'.format(url))
            return

        timenow = datetime.utcnow()
        time_diff = timedelta(1)
        images = []
        prices = []
        titles = []
        urls = []
        manufacturers = []
        models = []
        description = []
        rating = []
        review = []
        available = []
        marketplace = []
        for j in range(1, item_num + 1):
            # product exists and was updated less than 1 day ago: skip without updating
            # (left disabled: sku's length is always full, e.g. 15, so skipping would misalign the lists)
#            product = self.conn.get_product(sku[j-1])
#            if product:
#                if time_diff > (timenow - product['update_time']):
#                    continue

            try:
                node = iter_ret.xpath('.//div[@class="hproduct"][{0}]'.format(j))[0]
            except Exception:
                log.log_traceback(self.logger_list, 'Product number [{0}] was not found'.format(j))
                continue

            try:
                image = node.xpath('.//div[@class="image-col"]/a/img/@src')
                if image:
                    images.append(image[0])
                else:
                    images.append('')
            except:
                images.append('')
                log.log_traceback(self.logger_list)

            try:
                price = node.xpath('.//div[@class="info-side"]//span[@itemprop="price"]/text()')
                if price:
                    prices.append(price[0])
                else:
                    info_id = node.xpath('.//div[@class="info-side"]//a[contains(@href, "viewPrice")]/@href')
                    if info_id:
                        # javascript:bbyCartController.viewPrice('{skuId:2658068,productId:1218343212620}')
                        info_id = info_id[0].split('{')[-1].split('}')[0].split(',') # ['skuId:2658068', 'productId:1218343212620']
                        info = [lid.split(':') for lid in info_id]
                        price_url = 'http://www.bestbuy.com/site/olspage.jsp?{0}={1}&{2}={3}&id=pcat18005&type=page&renderMapCart=true'.format(info[0][0], info[0][1], info[1][0], info[1][1])

                        price_page = self.fetch_page(price_url)
                        price_page_tree = lxml.html.fromstring(price_page)
                        price_hide = price_page_tree.xpath('//div[@class="bby-price css-price bdt-price"]//span[@itemprop="price"]/text()')
                        prices.append(price_hide[0])
                    else:
                        prices.append('')
            except:
                prices.append('')
                log.log_traceback(self.logger_list)

            try:
                title = node.xpath('.//div[@class="info-main"]/h3[@itemprop="name"]/a')
                # //text() '\nEnergizer - Disney ', '<b>Cars</b>', ' LED Handheld Flashlight - Red/Black'
                if title:
                    titles.append(title[0].text_content().lstrip('\n'))
                    urls.append(title[0].get('href'))
                else:
                    titles.append('')
                    urls.append('')
            except:
                titles.append('')
                urls.append('')
                log.log_traceback(self.logger_list)

            try:
                manufacturer = node.xpath('.//div[@class="info-main"]/span[@itemprop="manufacturer"]/span/@content')
                if manufacturer:
                    manufacturers.append(manufacturer[0])
                else:
                    manufacturers.append('')
            except:
                manufacturers.append('')
                log.log_traceback(self.logger_list)

            try:
                model = node.xpath('.//div[@class="info-main"]/div[@class="attributes"]//strong[@itemprop="model"]/text()')
                if model:
                    models.append(model[0])
                else:
                    models.append('')
            except:
                models.append('')
                log.log_traceback(self.logger_list)

            try:
                desc = node.xpath('.//div[@class="info-main"]/div[@class="description"]')
                if desc:
                    description.append(desc[0].text_content())
                else:
                    description.append('')
            except:
                description.append('')
                log.log_traceback(self.logger_list)

            try:
                rate = node.xpath('.//div[@class="info-main"]/div[@class="rating"]')
                if rate:
                    # [u'\nCustomer Reviews:\n\xa0\nBe\nthe first to write a review.\n']
                    rate_ = rate[0].text_content().split('\n', 2)[-1].strip()
                    r_ = rate_.split('\n')
                    if r_[0] == u'Be':
                        # u'Be\nthe first to write a review.'
                        rating.append('')
                        review.append('')
                    else:
                        # '3  of 5\n\n(2 reviews)'
                        rating.append(r_[0])
                        review.append(r_[-1].lstrip('(').rstrip(')'))
                else:
                    # combination, monitor and a host
                    rating.append(None)
                    review.append(None)
            except:
                rating.append('')
                review.append('')
                log.log_traceback(self.logger_list)

            try:
                avail = node.xpath('.//div[@class="info-main"]/div[@class="availHolder"]//div[@class="tooltip-contents"]/p/text()')
                # ['  Usually leaves our warehouse in 1 business day ', '\n\n', u'\xa0\n\n\n\n\n\n\n', '\n'], [' Not available', '\n\n\n', u' Not Available\xa0\n', 'Find it at a  Best Buy store.\n', '\n'], ['\n', ': Seller usually ships within 1-2 business days'], ['\n', '  Usually leaves our warehouse in 1 business day'], ['\n', ' You will schedule your delivery date in the next step.\n\n\n\n\n\n', '\n', '\n\n\n', u' Not Available\xa0\n'], ['\n', ' You will schedule your delivery date in the next step.\n\n\n\n\n\n', '\n', '\n\n\n', u' Not Available\xa0\n\n', 'Find it at a  Best Buy store.', '\n'], [' ', 'Not Available for Shipping ', '\n\n\n', u'\xa0\n\n\n\n\n\n\n', '\n']]
                if not avail:
                    avail = node.xpath('.//div[@class="info-main"]/div[@class="availHolder"]/a/span/text()')
                if not avail:
                    available.append('')
                else:
                    if avail[0] == '\n' or avail[0] == ' ':
                        available.append(avail[1].split(':')[-1].strip())
                    else:
                        available.append(avail[0].strip())
            except:
                available.append('')
                log.log_traceback(self.logger_list)

            try:
                mrkpl = node.xpath('.//div[@class="info-main"]/div[@class="mrkpl"]//dd[@class="seller_info "]/a/text()')
                # [], ['\nBuy.com\n']
                if mrkpl:
                    marketplace.append(mrkpl[0].strip('\n'))
                else:
                    marketplace.append('')
            except:
                marketplace.append('')
                log.log_traceback(self.logger_list)

        log.log_print('{0} {1} {2} {3} {4} {5} {6} {7} {8} {9} {10}'.format(len(images),len(prices),len(titles),len(urls),len(manufacturers),len(models),len(sku),len(description),len(rating),len(review),url), self.logger_list)
        update_now = datetime.utcnow()
        for i in xrange(item_num):
            try:
                best_sell = item_per_page * (page_num-1) + i + 1
                self.conn.update_listing(sku[i], images[i], prices[i], titles[i], urls[i], manufacturers[i], models[i], description[i], rating[i], review[i], best_sell, catstr, update_now, detail_parse=False)
            except Exception:
                log.log_traceback(self.logger_list, 'item {0} of {1} items'.format(i, item_num))