Esempio n. 1
0
 def decrypt(self, soup, cls_dict, css_dict, pattern='.*', comment=False):
     """Decrypt the children of an obfuscated tag and match the resulting text.

     soup: the encrypted tag whose children are walked.
     cls_dict, css_dict: decryption dictionaries obtained by parsing the
         css file; passed through to ``self._get_decrypted``.
     pattern: regular expression applied (via ``from_pattern``) to the
         decrypted text; the matching content is returned.
     comment: whether the current tag is a review-comment tag. Tags that
         are not part of the review content do not need to set this to
         True. Passed through to ``self._get_decrypted``.

     Returns whatever ``from_pattern(pattern, text)`` returns, where
     ``text`` is ``_clean`` applied to the collected pieces.
     """
     # Walk a private copy so the caller's ``soup.contents`` is left intact
     # (popping from the live list would strip the tag of its children and
     # make a second call on the same tag see nothing).  The copy is kept
     # reversed so the next node in document order is always at the end,
     # making both pop and push O(1).
     pending = list(soup.contents)
     pending.reverse()
     pieces = []
     while pending:
         node = pending.pop()
         if isinstance(node, Tag):
             if node.name in DECRYPT_TAGS:
                 first_class = node['class'][0]
                 if first_class in IGNORED_SPAN_CLASS:
                     continue
                 if first_class == 'item':
                     # Flatten the wrapper: its children are processed
                     # next, in their original order.
                     pending.extend(reversed(node.contents))
                     continue
                 node = self._get_decrypted(node, cls_dict, css_dict, comment)
             # Tags outside DECRYPT_TAGS are collected unchanged.
         elif not isinstance(node, str):
             # Neither a tag nor text: nothing to collect.
             continue
         pieces.append(node)
     text = _clean(pieces)
     return from_pattern(pattern, text)
Esempio n. 2
0
def get_full_phone(phone_str, cityId):
    """Split a phone string into individual numbers, adding an area code.

    ``phone_str`` is split on non-breaking spaces (U+00A0) and each fragment
    is stripped; empty fragments are dropped.  A fragment for which
    ``from_pattern(PATTERN_PHONE, fragment)`` is truthy is kept as is; any
    other fragment is prefixed with the value of
    ``get_city_areacode(cityId)`` and a ``'-'`` separator.
    NOTE(review): PATTERN_PHONE presumably recognises numbers that already
    carry an area code -- confirm against its definition.

    Returns ``None`` when ``phone_str`` is falsy, otherwise a list of
    strings.
    """
    if not phone_str:
        return None
    numbers = []
    area_code = None  # looked up lazily, only if some fragment needs it
    for fragment in phone_str.split('\xa0'):
        # Strip before testing for emptiness so whitespace-only fragments
        # do not produce a bare "<code>-" entry.
        number = fragment.strip()
        if not number:
            continue
        if from_pattern(PATTERN_PHONE, number):
            numbers.append(number)
            continue
        if area_code is None:
            area_code = get_city_areacode(cityId)
        numbers.append('-'.join([area_code, number]))
    return numbers
Esempio n. 3
0
 def decrypt(self, soup, cls_dict, css_dict, pattern='.*', comment=False):
     """Collect the decrypted pieces of ``soup`` and match them to ``pattern``.

     The children of ``soup`` are visited in document order:

     * tags whose name is in ``DECRYPT_TAGS`` are skipped when their first
       class is in ``IGNORED_SPAN_CLASS``, expanded into their own children
       when their first class is ``'item'``, and otherwise replaced by
       ``self._get_decrypted(tag, cls_dict, css_dict, comment)``;
     * other tags and plain strings are collected unchanged;
     * anything else is ignored.

     The collected pieces go through ``_clean`` and the result is handed to
     ``from_pattern(pattern, text)``, whose return value is returned.
     """
     # Use a reversed private copy as a stack; consuming ``soup.contents``
     # directly would empty the caller's tag as a side effect.
     stack = list(reversed(soup.contents))
     collected = []
     while stack:
         current = stack.pop()
         if isinstance(current, Tag):
             if current.name in DECRYPT_TAGS:
                 css_class = current['class'][0]
                 if css_class in IGNORED_SPAN_CLASS:
                     continue
                 if css_class == 'item':
                     # Push children reversed so the first child is popped
                     # next, preserving document order.
                     stack.extend(reversed(current.contents))
                     continue
                 current = self._get_decrypted(
                     current, cls_dict, css_dict, comment)
         elif not isinstance(current, str):
             continue
         collected.append(current)
     text = _clean(collected)
     return from_pattern(pattern, text)