def decrypt(self, soup, cls_dict, css_dict, pattern='.*', comment=False):
    '''
    Decrypt the children of an obfuscated tag and return the matching text.

    The direct children of ``soup`` are visited in document order:

    * a tag whose name is in ``DECRYPT_TAGS`` is skipped when its first
      CSS class is in ``IGNORED_SPAN_CLASS``, expanded in place into its
      own children when its first class is ``'item'``, and otherwise
      replaced by the result of ``self._get_decrypted``;
    * any other tag is kept as-is;
    * a non-tag node is kept only when it is a ``str``.

    The collected pieces are passed to ``_clean`` and the cleaned text is
    matched against ``pattern`` with ``from_pattern``.

    soup: the encrypted tag. It is not modified by this call.
    cls_dict, css_dict: decryption dictionaries obtained by parsing the
        CSS file; forwarded to ``self._get_decrypted``.
    pattern: regular expression applied to the decrypted text; the content
        matching it is returned.
    comment: whether the tag belongs to a review-comment element; tags
        that are not part of comment content do not need to set it to
        True. Forwarded to ``self._get_decrypted``.

    Returns whatever ``from_pattern(pattern, text)`` returns.
    Raises ``KeyError`` if a ``DECRYPT_TAGS`` tag has no ``class``
    attribute (unchanged from the previous behaviour).
    '''
    # Use a reversed private copy as a stack. Popping straight from
    # ``soup.contents`` would empty the caller's tree, and popping from the
    # end of a list is O(1) where pop(0)/insert(0, ...) are O(n).
    pending = list(reversed(soup.contents))
    pieces = []
    while pending:
        node = pending.pop()
        # NOTE(review): the source was whitespace-collapsed; the ``elif``
        # below is taken to pair with this ``isinstance(node, Tag)`` check.
        if isinstance(node, Tag):
            if node.name in DECRYPT_TAGS:
                first_class = node['class'][0]
                if first_class in IGNORED_SPAN_CLASS:
                    continue
                if first_class == 'item':
                    # Push the wrapper's children so they are handled next,
                    # in their original order, instead of the wrapper.
                    pending.extend(reversed(node.contents))
                    continue
                node = self._get_decrypted(node, cls_dict, css_dict, comment)
        elif not isinstance(node, str):
            continue
        pieces.append(node)
    text = _clean(pieces)
    return from_pattern(pattern, text)
def get_full_phone(phone_str, cityId):
    '''
    Split a phone string into individual numbers, adding an area code
    where needed.

    ``phone_str`` is split on non-breaking spaces (``'\\xa0'``). Each
    non-blank piece is stripped; a piece for which
    ``from_pattern(PATTERN_PHONE, piece)`` is truthy is kept unchanged
    (presumably it already carries an area code -- confirm against
    ``PATTERN_PHONE``). Any other piece is prefixed with the area code of
    ``cityId`` and a ``'-'``.

    phone_str: raw phone text; may be empty or None.
    cityId: city identifier passed to ``get_city_areacode``.

    Returns None when ``phone_str`` is empty/None, otherwise a list of
    phone strings (possibly empty).
    '''
    if not phone_str:
        return None

    numbers = []
    area_code = None
    for raw in phone_str.split('\xa0'):
        number = raw.strip()
        # Filter AFTER stripping: a whitespace-only piece must not turn
        # into a bogus "<area code>-" entry.
        if not number:
            continue
        if from_pattern(PATTERN_PHONE, number):
            numbers.append(number)
            continue
        # Look the area code up lazily and at most once per call; it only
        # depends on cityId, not on the individual number.
        if area_code is None:
            area_code = get_city_areacode(cityId)
        numbers.append('-'.join([area_code, number]))
    return numbers
def decrypt(self, soup, cls_dict, css_dict, pattern='.*', comment=False):
    '''
    Decode the children of ``soup`` and return the text matching ``pattern``.

    Children are processed in document order. A tag named in
    ``DECRYPT_TAGS`` is dropped if its first class is listed in
    ``IGNORED_SPAN_CLASS``, replaced by its own children if its first
    class is ``'item'``, and otherwise swapped for the value returned by
    ``self._get_decrypted(tag, cls_dict, css_dict, comment)``. Other tags
    are kept unchanged; non-tag nodes are kept only if they are ``str``
    instances. The kept pieces go through ``_clean`` and the result is
    matched with ``from_pattern(pattern, text)``.

    soup: tag whose children are decoded; left unmodified.
    cls_dict, css_dict: lookup tables forwarded to ``_get_decrypted``
        (presumably built from the site's CSS -- confirm with caller).
    pattern: regular expression handed to ``from_pattern``.
    comment: flag forwarded unchanged to ``_get_decrypted``.

    Returns the value produced by ``from_pattern``.
    Raises ``KeyError`` when a ``DECRYPT_TAGS`` tag lacks a ``class``
    attribute (same as before).
    '''
    # Reversed copy used as a stack: keeps ``soup.contents`` intact (the
    # old code popped the caller's own child list empty) and makes every
    # pop/push O(1).
    stack = list(reversed(soup.contents))
    collected = []
    while stack:
        current = stack.pop()
        # NOTE(review): indentation was lost in the source; the ``elif`` is
        # assumed to belong to this outer ``isinstance`` test.
        if isinstance(current, Tag):
            if current.name in DECRYPT_TAGS:
                css_class = current['class'][0]
                if css_class in IGNORED_SPAN_CLASS:
                    continue
                if css_class == 'item':
                    # Reverse so the first child ends up on top of the stack.
                    stack.extend(reversed(current.contents))
                    continue
                current = self._get_decrypted(current, cls_dict, css_dict, comment)
        elif not isinstance(current, str):
            continue
        collected.append(current)
    text = _clean(collected)
    return from_pattern(pattern, text)