Python encoding Examples, jd_utils.encoding Python Examples

Example #1

0

Show file

File: jd_logger.py Project: liulangjita/dust_repos

 def log(self, cls_name, method_name, message, type):
     """
     Log ``message`` through a logger named after the calling class/method.

     The logger is obtained with the name ``"<cls_name> - <method_name>:"``
     and every record carries ``class`` and ``method`` as extra fields.

     Args:
         cls_name: name of the class the message originates from.
         method_name: name of the method the message originates from.
         message: text to log.
         type: ``'info'``, ``'warning'``, ``'error'`` or ``'critical'``.
             Any other value logs nothing.  ``'info'`` is only logged when
             ``self.debug`` is truthy.  (The parameter shadows the builtin
             ``type``; the name is kept so keyword callers keep working.)

     Raises:
         Exception: after the record is logged, when ``type`` is
             ``'error'`` or ``'critical'``.  The exception text is
             ``"<TYPE>: <message>"``.
     """
     logger = logging.getLogger("%s - %s:" % (cls_name, method_name))

     # Echo the message to stdout when the logger is verbose.  The
     # parenthesised form prints the same text on Python 2 and, unlike the
     # ``print u"..."`` statement, is also valid syntax on Python 3.
     if self.verbose:
         print(u"%s: [%s - %s()] %s:" % (type.upper(), cls_name, method_name, message))

     # Extra fields attached to every record.
     extras = {
         'class': cls_name,
         'method': method_name,
     }

     # Pick the level from the message type.
     if type == 'info':
         # Info messages are only recorded in debug mode.
         if self.debug:
             logger.setLevel(logging.INFO)
             # NOTE(review): only the info path passes the message through
             # jd_utils.encoding(); confirm whether the other levels should too.
             logger.info(jd_utils.encoding(message), extra=extras)

     elif type == 'warning':
         logger.setLevel(logging.WARNING)
         logger.warning(message, extra=extras)

     elif type == 'error':
         logger.setLevel(logging.ERROR)
         logger.error(message, extra=extras)
         # Errors are logged first and then surfaced to the caller.
         raise Exception("%s: %s" % (type.upper(), message))

     elif type == 'critical':
         logger.setLevel(logging.CRITICAL)
         logger.critical(message, extra=extras)
         raise Exception("%s: %s" % (type.upper(), message))

Example #2

0

Show file

File: jd_spider.py Project: dream1986/jd_spider

def get_product_ids(url, jdb, tid):
    """Fetch ``url`` and record the jd.com links found on the page.

    Anchors whose ``href`` looks like a jd.com ``.htm``/``.html`` page are
    collected.  Links matching the product-page pattern are handed to
    ``jdb.db_insert_product``; the remaining links, unless they match one of
    the excluded sub-sites, are handed to ``jdb.db_insert_no_product``.  All
    inserts happen while holding ``gdb_lock``.

    Args:
        url: page to download.
        jdb: object providing ``db_insert_product`` and
            ``db_insert_no_product``.
        tid: integer thread id, used only in the progress message.

    Returns:
        None.  Failures are reported with ``print`` and end the call early.
    """
    flag = 0  # number of connection-reset retries performed so far
    while True:
        try:
            request = urllib.request.Request(url, headers=jd_headers)
            # Close the HTTP response as soon as the body has been read.
            with urllib.request.urlopen(request) as g_response:
                if g_response.info().get('Content-Encoding') == 'gzip':
                    # 16 + MAX_WBITS makes zlib expect a gzip header/trailer.
                    g_read = zlib.decompress(g_response.read(), 16 + zlib.MAX_WBITS)
                else:
                    g_read = g_response.read()

            url_html = jd_utils.encoding(g_read)

            url_soup = BeautifulSoup(url_html)
            url_extend = url_soup.findAll('a', attrs={"href": re.compile(r"^http://\w+.jd.com/.+\.(htm|html)$")})
            break
        except http.client.IncompleteRead:
            # NOTE(review): truncated reads are retried without any limit.
            continue
        except Exception as e:
            if flag > 3:
                print("网络异常，放弃展开URL")
                return
            # Not every exception carries ``errno``; reading the attribute
            # directly raised AttributeError from inside this handler.
            if getattr(e, 'errno', None) == errno.ECONNRESET:
                flag = flag + 1
                time.sleep(20)
                print("重试中...[%d]" % flag)
                continue
            print("展开产品链接异常:" + str(e))
            return

    # NOTE(review): the dots in these patterns are unescaped, so they match
    # any character; kept as-is to preserve the current matching behaviour.
    product_re = re.compile(r'^http://item.jd.com/\d+.html$')
    ignored_res = (
        re.compile(r'^http://(help|red|tuan|auction|jr|smart|gongyi|app|en|media|m|myjd|chat|read|chongzhi|z|giftcard|fw|you|mobile|wiki|me).jd.com'),
        re.compile(r'^http://www.jd.com/compare/'),
        re.compile(r'^http://club.jd.com/consultation/'),
    )

    prds = []
    no_prds = []
    for url_item in url_extend:
        url_str = url_item.get("href")
        if product_re.match(url_str):
            prds.append(url_str)
        elif not any(pattern.match(url_str) for pattern in ignored_res):
            no_prds.append(url_str)

    # Take the lock once and write everything in one batch.
    if prds or no_prds:
        with gdb_lock:
            print('线程[%d] 插入数据库...' % tid)
            for item in prds:
                jdb.db_insert_product(item)
            for item in no_prds:
                jdb.db_insert_no_product(item)

Example #3

0

Show file

File: jd_spider.py Project: dream1986/jd_spider

    def get_product_consults(self, product_url):
        """Download the consultation pages of one product into a text file.

        The product page is fetched first to obtain the product name and the
        breadcrumb categories.  The categories (at most four) become nested
        directories under ``jd_config.JDSPR_RESULT`` and the output file is
        ``<product_id>.txt`` inside them.  Consultation pages are then
        fetched one by one and passed, together with the open file, to
        ``self.get_page_consult`` until the page has no "next" link.

        Args:
            product_url: product page URL; the numeric product id is taken
                from it.

        Returns:
            None.  Problems are reported with ``print`` and end the call
            early.  If the output file already exists the product is skipped.
            If the product has no consultations the file is deleted again.
        """
        result_path = jd_config.JDSPR_RESULT
        self.agent = None
        page_id = 1
        product_id = int(product_url.split('.')[2].split('/')[1])
        product_url = jd_item_url % product_id

        flag = 0  # number of connection-reset retries performed so far
        while True:
            try:
                self.agent = random_jd_header(product_url)
                request = urllib.request.Request(product_url, headers=self.agent)
                # Close the HTTP response as soon as the body has been read.
                with urllib.request.urlopen(request) as g_response:
                    if g_response.info().get('Content-Encoding') == 'gzip':
                        # 16 + MAX_WBITS makes zlib expect a gzip wrapper.
                        g_read = zlib.decompress(g_response.read(), 16 + zlib.MAX_WBITS)
                    else:
                        g_read = g_response.read()

                product_html = jd_utils.encoding(g_read)
                # Success.
                break
            except UnicodeDecodeError:
                print("GBK/Unicode编解码错误!")
                return
            except http.client.IncompleteRead:
                # NOTE(review): truncated reads are retried without any limit.
                continue
            except Exception as e:
                if flag > 3:
                    print("咨询线程[%d]网络异常，放弃该产品" % self.tid)
                    return
                # Not every exception carries ``errno``; reading the attribute
                # directly raised AttributeError from inside this handler.
                if getattr(e, 'errno', None) == errno.ECONNRESET:
                    flag = flag + 1
                    time.sleep(10)
                    print("咨询线程[%d]重试中...[%d]" % (self.tid, flag))
                    continue
                print("1.其它异常:" + str(e))
                return

        product_ts = None
        product_soup = BeautifulSoup(product_html)
        product_name = product_soup.find('h1')

        # Product categories come from the breadcrumb links.
        product_type = product_soup.find('div', attrs={"class": "breadcrumb"})
        if product_type:
            product_ts = product_type.findAll('a')

        if not product_name or not product_ts:
            print("1.产品名称和类别提取错误，返回！Check[%s]" % product_url)
            print(self.agent['User-Agent'])
            return

        result_file = None
        f = None
        try:
            i = 0
            for pt_item in product_ts:
                if pt_item:
                    result_path = result_path + "/" + pt_item.string + "/"
                    i = i + 1
                    # Limit the depth of the category directories.
                    if i > 3:
                        break
                else:
                    # No exception object exists here; the old message
                    # formatted an unbound ``e`` and failed with NameError.
                    print("2.提取产品名称和目录错误！Check[%s]" % product_url)
                    return

            if not os.path.exists(result_path):
                os.makedirs(result_path)
            result_file = "%s/%d.txt" % (result_path, product_id)
            if os.path.exists(result_file):
                # Already processed earlier.
                return

            print("咨询线程[%d]正在处理商品 %d" % (self.tid, product_id))
            f = codecs.open(result_file, 'wb', encoding='utf-8')
            f.write("产品名称：" + product_name.text + "\n")
        except Exception as e:
            print("3.提取产品名称和目录错误！:%s, Check[%s]" % (str(e), product_url))
            # Release the handle before removing the partial file.
            if f is not None:
                f.close()
            if result_file and os.path.exists(result_file):
                try:
                    os.remove(result_file)
                except OSError:
                    # Best-effort cleanup only.
                    pass
            return

        count = 0
        try:
            while True:
                product_consult_url = jd_consult_url % (product_id, page_id)
                flag = 0
                # Grows by one dot per fetch attempt; ".." means the page was
                # fetched on the first attempt.
                progress = "."
                while True:
                    progress = progress + "."
                    try:
                        self.agent = random_jd_header(product_url)
                        request = urllib.request.Request(product_consult_url, headers=self.agent)
                        with urllib.request.urlopen(request) as g_response:
                            if g_response.info().get('Content-Encoding') == 'gzip':
                                g_read = zlib.decompress(g_response.read(), 16 + zlib.MAX_WBITS)
                            else:
                                g_read = g_response.read()

                        consult_html = jd_utils.encoding(g_read)
                        # Success.
                        break
                    except UnicodeDecodeError:
                        print("GBK/Unicode编解码错误!")
                        return
                    except http.client.IncompleteRead:
                        continue
                    except Exception as e:
                        if flag > 3:
                            print("咨询线程[%d]网络异常，放弃该产品" % self.tid)
                            return
                        if getattr(e, 'errno', None) == errno.ECONNRESET:
                            flag = flag + 1
                            time.sleep(10)
                            print("咨询线程[%d]重试中...[%d]" % (self.tid, flag))
                            continue
                        print("2.其它异常:" + str(e))
                        return

                consult_soup = BeautifulSoup(consult_html)
                count = count + self.get_page_consult(consult_soup, f)
                if count == 0 and progress == "..":
                    print("咨询线程[%d] - 商品咨询为空，删除商品文件:%s" % (self.tid, result_file))
                    # Close before deleting; the handle used to be leaked here.
                    f.close()
                    if result_file and os.path.exists(result_file):
                        try:
                            os.remove(result_file)
                        except OSError:
                            pass
                    return

                pagination = consult_soup.find('div', attrs={"class": "Pagination"})
                if not pagination:
                    break
                if not pagination.findAll('a', attrs={"class": "next"}):
                    break
                page_id = page_id + 1
                f.flush()

            print("咨询线程[%d]处理完毕，咨询[%d] %d" % (self.tid, count, product_id))
        finally:
            # Runs on every exit path, including unexpected exceptions.
            f.close()

Example #4

0

Show file

File: jd_comments.py Project: liulangjita/dust_repos

    def get_product_comments(self, product_url):
        """Download the comment pages of one product into a text file.

        The product page is fetched first to obtain the product name and the
        breadcrumb categories.  Comment pages are then fetched one by one and
        passed to ``self.get_page_comment`` until the page has no "next"
        link.  A page that yields zero comments is requested again, up to 10
        times in a row.  Output goes to ``<product_id>_comm.txt`` under
        ``jd_config.JDSPR_RESULT_LOCAL``.

        Args:
            product_url: product page URL; the numeric product id is taken
                from it.

        Returns:
            ``(product_url, count, lucky_flag, result_path, product_id)``
            when paging finishes.  ``count`` is the total reported by
            ``get_page_comment``, ``lucky_flag`` is 0 when the last page
            still had no comments after all retries (otherwise 1), and
            ``result_path`` is the category path built from the breadcrumb.
            ``None`` on any failure, when the output file already exists, or
            when the product turns out to have no comments.
        """
        result_path = ""
        self.agent = random_jd_header()
        page_id = 1
        product_id = int(product_url.split('.')[2].split('/')[1])
        product_url = jd_item_url % product_id

        flag = 0  # number of connection-reset retries performed so far
        while True:
            try:
                request = urllib.request.Request(product_url, headers=self.agent)
                # Close the HTTP response as soon as the body has been read.
                with urllib.request.urlopen(request) as g_response:
                    if g_response.info().get('Content-Encoding') == 'gzip':
                        # 16 + MAX_WBITS makes zlib expect a gzip wrapper.
                        g_read = zlib.decompress(g_response.read(), 16 + zlib.MAX_WBITS)
                    else:
                        g_read = g_response.read()

                product_html = jd_utils.encoding(g_read)
                # Success.
                break
            except UnicodeDecodeError:
                print("GBK/Unicode编解码错误!")
                return
            except http.client.IncompleteRead:
                # NOTE(review): truncated reads are retried without any limit.
                continue
            except Exception as e:
                if flag > 3:
                    print("评论线程[%d]网络异常，放弃该产品" % self.tid)
                    return
                # Not every exception carries ``errno``; reading the attribute
                # directly raised AttributeError from inside this handler.
                if getattr(e, 'errno', None) == errno.ECONNRESET:
                    flag = flag + 1
                    time.sleep(10)
                    print("评论线程[%d]重试中...[%d]" % (self.tid, flag))
                    continue
                print("1.其它异常:" + str(e))
                return

        product_ts = None
        product_soup = BeautifulSoup(product_html)
        product_name = product_soup.find('h1')

        # Product categories come from the breadcrumb links.
        product_type = product_soup.find('div', attrs={"class": "breadcrumb"})
        if product_type:
            product_ts = product_type.findAll('a')

        if not product_name or not product_ts:
            print("1.产品名称和类别提取错误，返回！Check[%s]" % product_url)
            print(self.agent['User-Agent'])
            return

        result_file = None
        f = None
        try:
            i = 0
            for pt_item in product_ts:
                if pt_item:
                    result_path = result_path + "/" + pt_item.string + "/"
                    i = i + 1
                    # Limit the depth of the category path.
                    if i > 3:
                        break
                else:
                    # No exception object exists here; the old message
                    # formatted an unbound ``e`` and failed with NameError.
                    print("2.提取产品名称和目录错误！Check[%s]" % product_url)
                    return

            # The category path is only reported back to the caller; the
            # file itself is written into the flat local result directory.
            result_file = "%s/%d_comm.txt" % (jd_config.JDSPR_RESULT_LOCAL, product_id)
            if os.path.exists(result_file):
                # Already processed earlier.
                return

            print("评论线程[%d]正在处理商品 %d" % (self.tid, product_id))
            f = codecs.open(result_file, 'wb', encoding='utf-8')
            f.write("产品名称：" + product_name.text + "\n")
        except Exception as e:
            print("3.提取产品名称和目录错误！:%s, Check[%s]" % (str(e), product_url))
            # Release the handle before removing the partial file.
            if f is not None:
                f.close()
            if result_file and os.path.exists(result_file):
                try:
                    os.remove(result_file)
                except OSError:
                    # Best-effort cleanup only.
                    pass
            return

        count = 0
        retries = 0
        try:
            while True:
                # The middle URL parameter is randomised on every request
                # to make blocking less likely.
                product_comment_url = jd_comment_url % (product_id, random.randint(3, 10), page_id)
                flag = 0
                # Grows by one dot per fetch attempt; ".." means the page was
                # fetched on the first attempt.
                progress = "."
                while True:
                    progress = progress + "."
                    try:
                        self.agent = random_jd_header(product_comment_url)
                        request = urllib.request.Request(product_comment_url, headers=self.agent)
                        with urllib.request.urlopen(request) as g_response:
                            if g_response.info().get('Content-Encoding') == 'gzip':
                                g_read = zlib.decompress(g_response.read(), 16 + zlib.MAX_WBITS)
                            else:
                                g_read = g_response.read()

                        comment_html = jd_utils.encoding(g_read)
                        # Success.
                        break
                    except UnicodeDecodeError:
                        print("GBK/Unicode编解码错误!")
                        return
                    except http.client.IncompleteRead:
                        continue
                    except Exception as e:
                        if flag > 3:
                            print("评论线程[%d]网络异常，放弃该产品" % self.tid)
                            return
                        if getattr(e, 'errno', None) == errno.ECONNRESET:
                            flag = flag + 1
                            time.sleep(2)
                            print("评论线程[%d]重试中...[%d]" % (self.tid, flag))
                            continue
                        print("2.其它异常:" + str(e))
                        return

                comment_soup = BeautifulSoup(comment_html)
                count_t = self.get_page_comment(comment_soup, product_comment_url, f)

                # A page with zero comments is requested again, at most 10
                # times in a row, before its result is accepted.
                lucky_flag = 1
                if count_t == 0:
                    if retries < 10:
                        retries = retries + 1
                        if count != 0:
                            # Earlier pages had comments: switch user agent
                            # and wait a little before asking again.
                            self.agent = random_jd_header(product_comment_url)
                            time.sleep(random.randint(3, 9))
                        print("评论线程[%d] R[%d] %s" % (self.tid, retries, product_comment_url))
                        continue
                    else:
                        lucky_flag = 0

                retries = 0
                count = count + count_t

                if count == 0 and progress == "..":
                    print("评论线程[%d] - 商品咨询为空，删除商品文件:%s" % (self.tid, result_file))
                    # Close before deleting; the handle used to be leaked here.
                    f.close()
                    if result_file and os.path.exists(result_file):
                        try:
                            os.remove(result_file)
                        except OSError:
                            pass
                    return

                pagination = comment_soup.find('div', attrs={"class": "pagin fr"})
                if not pagination:
                    break
                if not pagination.findAll('a', attrs={"class": "next"}):
                    break
                page_id = page_id + 1
                f.flush()

            print("评论线程[%d]处理完毕，产品[%d]，评论[%d]，LUCK[%s]，PATH[%s]" % (self.tid, product_id, count, lucky_flag, result_path))
        finally:
            # Runs on every exit path, including unexpected exceptions.
            f.close()
        return (product_url, count, lucky_flag, result_path, product_id)

Example #5

0

Show file

def get_product_ids(url, jdb, tid):
    """Fetch ``url`` and record the jd.com links found on the page.

    Anchors whose ``href`` looks like a jd.com ``.htm``/``.html`` page are
    collected.  Links matching the product-page pattern are handed to
    ``jdb.db_insert_product``; the remaining links, unless they match one of
    the excluded sub-sites, are handed to ``jdb.db_insert_no_product``.  All
    inserts happen while holding ``gdb_lock``.

    Args:
        url: page to download.
        jdb: object providing ``db_insert_product`` and
            ``db_insert_no_product``.
        tid: integer thread id, used only in the progress message.

    Returns:
        None.  Failures are reported with ``print`` and end the call early.
    """
    flag = 0  # number of connection-reset retries performed so far
    while True:
        try:
            request = urllib.request.Request(url, headers=jd_headers)
            # Close the HTTP response as soon as the body has been read.
            with urllib.request.urlopen(request) as g_response:
                if g_response.info().get('Content-Encoding') == 'gzip':
                    # 16 + MAX_WBITS makes zlib expect a gzip header/trailer.
                    g_read = zlib.decompress(g_response.read(),
                                             16 + zlib.MAX_WBITS)
                else:
                    g_read = g_response.read()

            url_html = jd_utils.encoding(g_read)

            url_soup = BeautifulSoup(url_html)
            url_extend = url_soup.findAll(
                'a',
                attrs={
                    "href": re.compile(r"^http://\w+.jd.com/.+\.(htm|html)$")
                })
            break
        except http.client.IncompleteRead:
            # NOTE(review): truncated reads are retried without any limit.
            continue
        except Exception as e:
            if flag > 3:
                print("网络异常，放弃展开URL")
                return
            # Not every exception carries ``errno``; reading the attribute
            # directly raised AttributeError from inside this handler.
            if getattr(e, 'errno', None) == errno.ECONNRESET:
                flag = flag + 1
                time.sleep(20)
                print("重试中...[%d]" % flag)
                continue
            print("展开产品链接异常:" + str(e))
            return

    # NOTE(review): the dots in these patterns are unescaped, so they match
    # any character; kept as-is to preserve the current matching behaviour.
    product_re = re.compile(r'^http://item.jd.com/\d+.html$')
    ignored_res = (
        re.compile(
            r'^http://(help|red|tuan|auction|jr|smart|gongyi|app|en|media|m|myjd|chat|read|chongzhi|z|giftcard|fw|you|mobile|wiki|me).jd.com'
        ),
        re.compile(r'^http://www.jd.com/compare/'),
        re.compile(r'^http://club.jd.com/consultation/'),
    )

    prds = []
    no_prds = []
    for url_item in url_extend:
        url_str = url_item.get("href")
        if product_re.match(url_str):
            prds.append(url_str)
        elif not any(pattern.match(url_str) for pattern in ignored_res):
            no_prds.append(url_str)

    # Take the lock once and write everything in one batch.
    if prds or no_prds:
        with gdb_lock:
            print('线程[%d] 插入数据库...' % tid)
            for item in prds:
                jdb.db_insert_product(item)
            for item in no_prds:
                jdb.db_insert_no_product(item)