def extra_any_json(rawData: str, ruler: str, cap=None, list_path: list = None) -> list:
    """
    Decode a JSON payload and extract a list of dicts according to `ruler`.

    :param rawData: raw JSON (or JS-wrapped JSON) text
    :param cap: either a [head, tail] marker pair to cut the JSON out of
                `rawData`, or a str key to dig into after decoding
    :param list_path: optional path to the list inside the decoded structure
    :return: one dict per list element, keyed by the ruler output names
    """
    results = []
    rulers = ExtraHtml.ruler_killer(ruler)
    data = None
    if cap is not None:
        if isinstance(cap, list):
            # Cut the JSON fragment out between the two markers first.
            snippet = StringHelper.extra_a_to_b_x(rawData, cap[0], cap[1])
            data = demjson.decode(snippet)
        elif isinstance(cap, str):
            # Decode whole payload, then dig into the named sub-structure.
            data = ExtraJSON.dic_dip_extra(demjson.decode(rawData), cap)
    else:
        data = demjson.decode(rawData)
    if list_path is not None:
        data = StringHelper.dic_looper(data, list_path)
    for item in data:  # iterate decoded records
        try:
            row = dict()
            for rul in rulers:  # iterate ruler entries (key, rule)
                res = ExtraJSON.ruler_finder(item, rul[1])
                if res is not None:
                    row[rul[0]] = res
            results.append(row)
        except Exception as e:
            # Best-effort per record: log and keep going.
            print(e)
    return results
def looper_html(self, result, raw, exists, ruler, extra3_tup, extra4_tup):
    """
    Extract ruler dicts from page HTML and append the unseen ones to
    `result`, stopping at the first item whose 'link' is already known.

    :param result: output list, appended in place
    :param raw: page HTML text
    :param exists: already-seen 'link' values
    :param ruler: dict extraction rules
    :param extra3_tup: parent container spec ('tag key=value')
    :param extra4_tup: list item spec
    """
    tup = Sh.str_to_dictup(extra3_tup)
    parent_container = (tup[0], tup[1])
    tup = Sh.str_to_dictup(extra4_tup)
    list_tup = (tup[0], tup[1])
    # Locate the item groups and convert each into a ruler dict.
    groups = ExtraHtml.any_list_finder_ex(raw, parent_container, list_tup)
    for group in groups:
        if len(group) < 1:
            continue
        dic_list = ExtraHtml.tag_list_to_ruler_list_ex(group, ruler)
        # Duplicate check during routine crawling.
        try:
            link = dic_list['link']
        except (KeyError, TypeError):
            # No usable 'link' — stop, as the original bare except did.
            break
        if exists.count(link) < 1:
            result.append(dic_list)
        else:
            break
def get_txt_page_encode(raw: str) -> str:
    """
    Best-effort charset sniffing from raw HTML text.

    :param raw: page HTML
    :return: detected encoding name, or '' when it cannot be determined
    :raises HttpConnectionFailedException: propagated unchanged
    """
    try:
        charset = StringHelper.extra_equ_value(raw, 'charset', '"')
        return StringHelper.cutfrom(charset, '/')
    except HttpConnectionFailedException as e:
        raise e
    except:
        # Deliberate best-effort: any other failure yields ''.
        return ''
def get_page_encode(self, url):
    """
    Fetch `url` and sniff its charset declaration.

    :param url: page URL
    :return: detected encoding name, or '' when it cannot be determined
    :raises HttpConnectionFailedException: propagated unchanged
    """
    try:
        page = self.req.get(url)
        charset = StringHelper.extra_equ_value(page, 'charset', '"')
        return StringHelper.cutfrom(charset, '/')
    except HttpConnectionFailedException as e:
        raise e
    except:
        # Deliberate best-effort: any other failure yields ''.
        return ''
def ruler_finder_multi_at(tag, ruler2):
    """
    Handle an '@' rule of the form '<list-spec>@<parent-spec>': when `tag`
    matches the parent spec, extract every matching child group from its
    HTML and return them as a list of ruler dicts; otherwise None.
    """
    result = None
    pair = ruler2.split('@', 1)
    tup = Sh.str_to_dictup(pair[1])
    _tag = tup[0]
    dic = tup[1]
    parent_container = (_tag, dic)
    tup = Sh.str_to_dictup(pair[0])
    _tag = tup[0]
    dic = tup[1]
    list_tup = (_tag, dic)
    p_tag = parent_container[0]
    p_dic = parent_container[1]
    s_ruler = ''
    # Rebuild a 'tag attr=value' rule from the FIRST parent attribute only.
    for key in p_dic:
        s_ruler = p_tag + ' ' + key + '=' + p_dic[key]
        break
    ruler_pair = ('', s_ruler)
    ruler_pair_part_of = ruler_pair[1].split(' ', 2)
    identify = ruler_pair_part_of[1].split('=', 1)
    if tag.name != ruler_pair_part_of[0]:
        return result
    try:
        if tag.attrs[identify[0]] == identify[1]:
            result = []
            raw = str(tag)
            # Extract the list of groups into ruler dicts.
            # NOTE(review): `ruler` is NOT defined in this function (the
            # parameter is `ruler2`) — reaching the call below raises
            # NameError, which the bare except silently turns into a None
            # result. Confirm the intended source of `ruler`.
            list = ExtraHtml.any_list_finder_ex(raw, parent_container, list_tup)
            for item in list:
                if len(item) < 1:
                    continue
                dic_list = ExtraHtml.tag_list_to_ruler_list_ex(item, ruler)
                result.append(dic_list)
        return result
    except:
        result = None
    return result
def finder_sohu(self, raw):
    """
    Parse a Sohu 'var x = N;' snippet into a play-count dict.

    :param raw: raw JS snippet text
    :return: {'playCount': value} on success, implicitly None on failure
    """
    try:
        tail = raw.split('=')[1]
        return {'playCount': s.cut_tail(tail, ';')}
    except:
        # Best-effort: fall through to an implicit None.
        pass
def extra_any_json_dic(rawData, ruler, cap=None):
    """
    Decode a JSON payload and extract a single dict according to `ruler`.

    :param rawData: raw JSON (or JS-wrapped JSON) text
    :param cap: either a [head, tail] marker pair to cut the JSON out of
                `rawData`, or a str key to dig into after decoding
    :return: dict keyed by the ruler output names
    """
    result = {}
    rulers = ExtraHtml.ruler_killer(ruler)
    data = None
    if cap is not None:
        if isinstance(cap, list):
            # Cut the JSON fragment out between the two markers first.
            snippet = StringHelper.extra_a_to_b_x(rawData, cap[0], cap[1])
            data = demjson.decode(snippet)
        elif isinstance(cap, str):
            # Decode whole payload, then dig into the named sub-structure.
            data = ExtraJSON.dic_dip_extra(demjson.decode(rawData), cap)
    else:
        data = demjson.decode(rawData)
    try:
        for rul in rulers:  # (output key, rule)
            res = ExtraJSON.ruler_finder(data, rul[1])
            if res is not None:
                result[rul[0]] = res
    except Exception as e:
        # Best-effort: log and return whatever was collected.
        print(e)
    return result
def extra_any_json_ex(rawData, ruler, cap=None):
    """
    Decode a JSON object and extract values either from nested dicts
    (via ruler_finder) or directly from top-level keys matching a rule.

    :param rawData: raw JSON text
    :param cap: optional [head, tail] marker pair to cut the JSON out first
    :return: dict keyed by the ruler output names
    """
    dic = dict()
    if cap is not None:
        rawData = StringHelper.extra_a_to_b_x(rawData, cap[0], cap[1])
    rulers = ExtraHtml.ruler_killer(ruler)
    data = demjson.decode(rawData)
    for key in data:
        try:
            for rul in rulers:  # (output key, rule)
                res = None
                if isinstance(data[key], dict):
                    # Nested dict: delegate to the recursive finder.
                    res = ExtraJSON.ruler_finder(data[key], rul[1])
                elif key == rul[1]:
                    # Scalar at the top level matched by name.
                    res = data[key]
                if res is not None:
                    dic[rul[0]] = res
        except Exception as e:
            # Best-effort per key: log and keep going.
            print(e)
    return dic
def go_json(rawData, ruler, cap=None):
    """
    Single-record JSON extraction: flatten the decoded structure into
    'key value' strings, then pick out the entries named by `ruler`.

    :param rawData: raw JSON text
    :param cap: optional [head, tail] marker pair to cut the JSON out first
    :return: dict keyed by the ruler output names
    """
    dic = dict()
    if cap is not None:
        rawData = StringHelper.extra_a_to_b_x(rawData, cap[0], cap[1])
    rulers = ExtraHtml.ruler_killer_ex(ruler)
    data = demjson.decode(rawData)
    flat = []
    ExtraJSON.dic_to_list(data, flat)
    for item in flat:
        # `rul` avoids shadowing the `ruler` parameter (original bug-prone
        # reuse); rul = (output key, source token).
        for rul in rulers:
            if item.count(rul[1]) > 0:
                dic[rul[0]] = StringHelper.cut_head(item, rul[1] + ' ')
    return dic
def looper_html_ex(self, result, raw, ruler, extra3_tup, extra4_tup):
    """
    Extract ruler dicts from page HTML and append them all to `result`
    (no duplicate filtering).

    :param result: output list, appended in place
    :param raw: page HTML text
    :param ruler: dict extraction rules
    :param extra3_tup: parent container spec ('tag key=value')
    :param extra4_tup: list item spec
    """
    parent_spec = Sh.str_to_dictup(extra3_tup)
    parent_container = (parent_spec[0], parent_spec[1])
    item_spec = Sh.str_to_dictup(extra4_tup)
    list_tup = (item_spec[0], item_spec[1])
    # Pull out every tag group and convert each into a ruler dict.
    for group in ExtraHtml.any_list_finder_ex(raw, parent_container, list_tup):
        if len(group) < 1:
            continue
        result.append(ExtraHtml.tag_list_to_ruler_list_ex(group, ruler))
def looper_js(self, result: list, raw: str, exists: list, ruler: str, extra3_tup: str = None, list_json_path: str = None, identifier_key: str = None):
    """
    Decode a JS/JSON payload into dicts and append the unseen ones to
    `result`, stopping at the first item whose identifier is already known.

    :param result: output list, appended in place
    :param exists: already-seen identifier values
    :param extra3_tup: optional cap-marker spec forwarded as a tuple
    :param list_json_path: optional '->'-separated path to the list
    :param identifier_key: dedup key name, defaults to 'link'
    """
    if identifier_key is None:
        identifier_key = 'link'
    cap = None
    path = None
    if extra3_tup is not None:
        cap = Sh.str_to_tup(extra3_tup)
    elif list_json_path is not None:
        path = Sh.separator(list_json_path, '->')
    items = ExtraJSON.extra_any_json(raw, ruler, cap=cap, list_path=path)
    # Routine-crawl dedup: stop at the first already-seen identifier.
    for item in items:
        if exists.count(item[identifier_key]) < 1:
            result.append(item)
        else:
            break
def find_constant(self):
    """
    Compare the two sample URLs (self.second / self.third) to locate the
    path segment that differs between pages and, inside that segment, the
    numeric fragment that varies; record the direction of change in
    self.to_right (True when the number grows).
    """
    self.part_second = StringHelper.url_divider(self.second)
    self.part_third = StringHelper.url_divider(self.third)
    # The first differing path segment marks the paging component.
    for i in range(0, len(self.part_second)):
        if self.part_second[i] != self.part_third[i]:
            self.out_index = i
            break
    # NOTE(review): if the two URLs are identical, self.out_index is never
    # assigned here — confirm it is initialized elsewhere before this runs.
    self.constant_sec = self.part_second[self.out_index]
    constant_thi = self.part_third[self.out_index]
    self.list_sec = self.find_num_ex(self.constant_sec)
    self.list_thi = self.find_num_ex(constant_thi)
    # The first differing numeric fragment is taken as the page counter
    # (no break: the LAST differing fragment wins if several differ).
    for i in range(0, len(self.list_sec)):
        if self.list_sec[i] != self.list_thi[i]:
            self.variable_sec = self.list_sec[i]
            if self.list_sec[i] < self.list_thi[i]:
                self.to_right = True
            else:
                self.to_right = False
def finder_need_tag(tag, need_tag=False, next_sibling=False, next_sibling_text=False, get_text=False, ruler_pair: tuple = None):
    """
    Return the requested view of `tag`.

    :param tag: soup Tag (or any object exposing .text / .get_text / .next_sibling)
    :param need_tag: return the tag object itself
    :param next_sibling_text: return the text of the next sibling
    :param next_sibling: return (cleaned key, str(next sibling)) using `ruler_pair`
    :param get_text: return tag.get_text()
    :param ruler_pair: (key, rule) pair; only the key is used here
    :return: tag / text / (key, value) depending on the flags; None when
             next_sibling is set but ruler_pair is None
    """
    if need_tag:
        return tag
    elif next_sibling_text:
        return tag.next_sibling.text
    elif next_sibling:
        if ruler_pair is not None:
            # Build a fresh key instead of assigning into ruler_pair[0]:
            # callers (ruler_finder_ex) pass a tuple, and tuple item
            # assignment raised TypeError, silently killing this path.
            key = Sh.cut_tail(ruler_pair[0], '-')
            return (key, str(tag.next_sibling))
    elif get_text:
        return tag.get_text()
    else:
        return tag.text
def check_shutdown_status():
    """
    Read the shutdown flag file; when it contains 1, reset the flag and
    report True. Any failure is logged and treated as 'not shutting down'.
    """
    try:
        flag_file = Configs().system_shutdown_flag_filename
        flag_value = int(StringHelper.trim(FileHelper.read(flag_file)))
        if flag_value == 1:
            somebody_help.reset_shutdown_status()
            return True
        return False
    except:
        LogGo.error("system_shutdown_flag_file unavailable!")
        return False
def container_process(self, raw: str, tribleStar: str) -> str:
    """
    Trim `raw` to the span between two markers.

    :param raw: source text
    :param tribleStar: 'head***tail' marker pair; None or '' disables trimming
    :return: the trimmed text, or `raw` unchanged when no marker is given
    """
    # Guard clause: no marker means no trimming.
    if tribleStar is None or tribleStar == '':
        return raw
    parts = tribleStar.split('***')
    return Sh.strip_head_tail(raw, parts[0], parts[1])
def looper_js(self, result, raw, exists, ruler, captup=None):
    """
    Decode a JS/JSON payload into dicts and append the unseen ones to
    `result`, stopping at the first item whose 'link' is already known.

    :param result: output list, appended in place
    :param exists: already-seen 'link' values
    :param captup: optional cap spec; a two-space string is parsed to a tuple
    """
    cap = captup
    if captup is not None:
        if captup.count(' ') == 2:
            cap = Sh.str_to_tup(captup)
    items = ExtraJSON.extra_any_json(raw, ruler, cap=cap)
    for item in items:
        # Duplicate check during routine crawling. Restored: the original
        # had a leftover `if 1>0:` debug toggle that disabled this check
        # and made the `break` unreachable (siblings looper_html/looper_js
        # perform the same check).
        if exists.count(item['link']) < 1:
            result.append(item)
        else:
            break
def purify_a(tag):
    """
    Rewrite anchors in place: for an <a> carrying a 'data_ue_src' attribute,
    replace both 'data_ue_src' and 'href' with the trimmed 'sn=' query value.
    Recurses into all child tags.
    """
    if tag.name == 'a':
        try:
            raw_src = tag.attrs['data_ue_src']
            sn = StringHelper.extra_a_to_b(raw_src, 'sn=', '&')
            # Drop the first 3 and the last character of the extracted value.
            sn = sn[3:-1]
            tag.attrs['data_ue_src'] = sn
            tag.attrs['href'] = sn
        except:
            # Anchors without the attribute are left untouched.
            pass
    for child in tag.contents:
        if isinstance(child, Tag):
            ExtraJSON.purify_a(child)
def generate_url(self, index):
    """
    Build the page URL for `index` by shifting the numeric fragment found
    by find_constant().

    :param index: 1-based page index; indices <= 3 yield None
    :return: the generated URL, or None
    """
    if index <= 3:
        return None
    base = int(self.variable_sec[0])
    step = index - 2
    # Shift the counter in the direction learned from the sample URLs.
    target = base + step if self.to_right else base - step
    self.part_third[self.out_index] = StringHelper.exchange(
        self.constant_sec, self.variable_sec[0], str(target), self.variable_sec[1])
    return "/".join(self.part_third)
def ruler_finder_condition_content(ruler: str, extract=False) -> bool:
    """
    Detect a content rule of the form '[text]'.

    :param ruler: rule string to inspect
    :param extract: when True, return the bracketed text instead of True
    :return: True/False, or the extracted text when extract is set
    :raises Exception: when extract is set but nothing was found
    """
    open_mark = '['
    close_mark = ']'
    result = False
    try:
        is_content_rule = (ruler.count(open_mark) == 1
                           and ruler.count(close_mark) == 1
                           and ruler.startswith(open_mark))
        if is_content_rule:
            result = Sh.extra_a_to_b_x(ruler, open_mark, close_mark) if extract else True
    except:
        pass
    if extract and result is False:
        raise Exception("未找到指定 字符")
    return result
def scan(self, target, order):
    """
    Fetch statistics for `target` from the platform selected by
    self.td(target) ('i' iQiyi, 'l' LeTV, 't' QQ, 'm' MGTV, 'y' Youku,
    's' Sohu, 'c' CNTV), extract them via the matching ruler, then
    normalize through build_base_dic.

    :param target: record providing extra0 (id or URL) and extra1
    :param order: forwarded to build_base_dic
    :return: the first normalized result dict
    """
    result = []
    # NOTE(review): `type` shadows the builtin; renaming would be safer.
    type = self.td(target)
    url = target.extra0  # e.g. 'http://ent.people.com.cn/GB/81374/index1.html'
    cap = None
    ruler = None
    if self.td(target) == 'i':
        # iQiyi: JSON wrapped in 'var tvInfoJs=...'
        cap = ['var tvInfoJs=', '']
        url = self.iqiyi_base.format(url)
        ruler = 'keywords:contentKeyword;latestOrder:latestOrder;name:name;playCount:playCount;score:score;videoCount:videoCount'
    elif type == 'l':
        # LeTV
        ruler = 'score:plist_score;comments:pcommon_count;bullets:pdm_count;like:up;hate:down;playCount:plist_play_count'
        url = self.letv_base.format(url, target.extra1)
    elif type == 't':
        # QQ video: JSON inside a tlux.dispatch call
        cap = ["tlux.dispatch('$cover',", ");"]
        ruler = 'score:score->score;playCount:view_all_count;videoCount:episode_all;latestOrder:episode_updatedd'
        url = self.qq_base.format(url)
    elif type == 'm':
        # MGTV
        url = self.mgtv_base.format(url)
        cap = ['"data":', ',"msg"']
        ruler = 'playCount:all;like:like;hate:unlike'
    elif type == 'y':
        # Youku: scraped from the HTML page, not JSON
        ruler = 'playCount:li [总播放数];comments:li [评论];like:li [顶];score:span class=star-num'
        if not s.is_url(url):
            # Bare ids are expanded into a full Youku page URL.
            if not url.startswith('id'):
                url = self.youku_prefix.format(url)
            url = self.youku_base.format(url)
    elif type == 's':
        # Sohu: handled by the dedicated snippet parser below
        url = self.sohu_base.format(url)
    elif type == 'c':
        # CNTV: scraped from the HTML page
        url = self.cntv_base.format(url)
        ruler = 'playCount:^label [播放次数]'
    try:
        encode = ExtraHtml.get_page_encode(url)
        if type == 'y' or type == 'c':
            # HTML-page platforms go through the HTML looper.
            result = self.looper_html(url, ruler, encode, target)
        else:
            raw = RequestHelper.get(url, encode=encode)
            if type == 's':
                result = self.finder_sohu(raw)
            else:
                result = self.looper_js(raw, ruler, cap)
    except AttributeError as e:
        # Swallowed: missing attributes on partial pages are expected.
        pass
    except Exception as e:
        import traceback
        msg = traceback.format_exc()
        print(msg)
        LogGo.warning(repr(e))
    if len(result) > 0:
        result = self.build_base_dic(target, result, order)
    # NOTE(review): when `result` is empty (or finder_sohu returned None,
    # making len(result) raise above) this indexing raises — confirm
    # callers rely on that exception rather than a None return.
    return result[0]
def extract_dic_list_from_page(self, result, url, parent_attr, list_attr, ruler, exists: list = None, encode: str = None):
    """
    Extract a list of ruler dicts from one page.

    :param result: output list, appended in place
    :param url: page URL to fetch
    :param parent_attr: parent container attribute spec ('tag key=value')
    :param list_attr: list item attribute spec
    :param ruler: dict extraction rules
    :param exists: known 'link' values; when given, stop at the first duplicate
    :param encode: page encoding passed to the HTTP getter
    """
    raw = self.req.get(url, encode=encode)
    tup = Sh.str_to_dictup(parent_attr)
    parent_container = (tup[0], tup[1])
    tup = Sh.str_to_dictup(list_attr)
    list_tup = (tup[0], tup[1])
    # Locate the item groups and convert each into a ruler dict.
    groups = ExtraHtml.any_list_finder_ex(raw, parent_container, list_tup)
    for group in groups:
        if len(group) < 1:
            continue
        dic_list = ExtraHtml.tag_list_to_ruler_list_ex(group, ruler)
        if exists is None:
            result.append(dic_list)
        elif exists.count(dic_list['link']) < 1:
            # Duplicate check during crawling: unseen link, keep going.
            result.append(dic_list)
        else:
            break
# (Removed: a large commented-out "备份 1204" backup copy of
# ruler_finder_ex, superseded by the live implementation below.)
def ruler_finder_ex(tag, ruler_pair, need_tag=False):
    """
    Core HTML element extraction routine.

    :param tag: the soup Tag currently being visited
    :param ruler_pair: (output key, rule text) extraction rule
    :param need_tag: when True, return the matched child Tag itself
    :return: (key, value) on a match, otherwise None
    """
    result = None
    ruler2 = ruler_pair[1]
    next_sibling = False

    # Does the output key request the nextSibling?
    if ExtraHtml.ruler_finder_condition_next_sibling(ruler_pair[0]):
        next_sibling = True

    # Content rule: '<span>hello</span>' is matched by the rule '[hello]'.
    if ExtraHtml.ruler_finder_condition_content(ruler2):
        try:
            if tag.get_text() == ExtraHtml.ruler_finder_condition_content(ruler2, True):
                if next_sibling:
                    # NOTE(review): on this path finder_need_tag is expected
                    # to return the (key, value) pair itself — confirm.
                    result = ExtraHtml.finder_need_tag(tag, need_tag, next_sibling=True, ruler_pair=ruler_pair)
                else:
                    result = (ruler_pair[0], ExtraHtml.finder_need_tag(tag, need_tag))
            return result
        except:
            pass
    # Fetch all same-typed children under a parent tag ('@' rule).
    # Appears to be an abandoned feature.
    elif ExtraHtml.ruler_finder_condition(ruler2) == 'at_loop':
        re = ExtraHtml.ruler_finder_multi_at(tag, ruler2)

        if re is not None:
            result = (ruler_pair[0], re)
        return result
    # Lookup driven by the PARENT tag's attributes ('child:parent attr=value').
    elif ruler_pair[1].count(':') > 0 and ruler_pair[1].count(':') > ruler_pair[1].count('http:'):
        status = False
        _pair = ruler_pair[1].split(':', 1)
        name = _pair[0]  # maybe the child tag
        name_part = []

        if name.count(' ') > 0:  # maybe it carries an attribute
            name_part = name.split(' ', 1)  # maybe the attribute
            name = name_part[0]  # maybe the tag name

        if tag.name != name:  # not this tag — skip
            return None

        # maybe the parent-tag part
        parent_pair = _pair[1].split(' ', 1)
        # maybe the parent tag name
        parent_name = parent_pair[0]

        # skip when the visited tag has no parent, the parent is not a Tag,
        # or the parent's name does not match
        if tag.parent == None or not isinstance(tag.parent, Tag) or tag.parent.name != parent_name:
            return None

        # no attribute requested — the parent requirement is already met
        if len(parent_pair) == 1:
            status = True

        # the parent tag's attribute
        parent_attr = parent_pair[1].split('=', 1)

        # parent attribute present, or nothing after the '='
        # NOTE(review): the and/or precedence in the next two conditions
        # likely does not match this intent — confirm before changing.
        if len(parent_attr) == 1 or (len(parent_attr) > 1 and parent_attr[1] == None or parent_attr[1] == ''):
            try:
                attr = tag.parent.attrs[parent_attr[0]]
                status = True
            except:
                pass

        if len(parent_attr) > 1 and parent_attr[1] != None or parent_attr[1] != '':
            try:
                if ' '.join(tag.parent.attrs[parent_attr[0]]) == parent_attr[1]:
                    status = True
            except:
                pass

        if status:
            if len(name_part) > 1 and name_part[1] != None and name_part != '':
                # Recurse on the child part of the rule.
                return ExtraHtml.ruler_finder_ex(tag, (ruler_pair[0], _pair[0]), need_tag)
            else:
                result = (ruler_pair[0], ExtraHtml.finder_need_tag(tag, need_tag))
    # Lookup driven by the tag's OWN attributes.
    elif ruler_pair[1].count(' ') > 0 or ruler_pair[1].count('=') == 2:
        if ruler_pair[1].count('=') == 2:
            ruler_pair_part_of = ruler_pair[1].split(' ', 2)
            identify = ruler_pair_part_of[2].split('=', 1)

            if tag.name != ruler_pair_part_of[0]:
                return None

            try:
                if tag.attrs[identify[0]] == identify[1]:
                    # Drop the matched attribute and recurse with the rest.
                    return ExtraHtml.ruler_finder_ex(tag, (ruler_pair[0], ruler_pair_part_of[0] + ' ' + ruler_pair_part_of[1]), need_tag)
            except:
                pass
        else:
            _pair = ruler_pair[1].split(' ', 1)
            _name = _pair[0]

            next_flag = False

            # '^' prefix on the tag name requests the next sibling's text.
            if _name.count('^') == 1:
                next_flag = True
                _name = Sh.cut_head(_name, '^')

            if tag.name != _name:
                return None

            # Keyword query: '[keyword]' in the attribute part.
            keyword = Sh.extra_a_to_b_x(_pair[1], '[', ']')
            if _pair[1].count('[') == 1 and _pair[1].count(']') == 1:
                if keyword in tag.text and len(tag.contents) <= 1:
                    if next_flag:
                        return (ruler_pair[0], ExtraHtml.finder_need_tag(tag, need_tag, True))
                    else:
                        return (ruler_pair[0], ExtraHtml.finder_need_tag(tag, need_tag))

            equ_pair = _pair[1].split('=', 1)

            _attr = equ_pair[0]

            # '(value)' suffix on the attribute name is split off here.
            val = StringHelper.extra_a_to_b(_attr, '(', ')')
            _attr = StringHelper.delete_piece(_attr, val)

            for item in equ_pair:
                if item == '':
                    equ_pair.remove(item)

            if len(equ_pair) > 1:
                value = equ_pair[1]

                try:
                    att = tag.attrs[_attr]
                    _value = ''
                    if isinstance(att, str):
                        _value = att
                    elif isinstance(att, list):
                        # Multi-valued attributes (e.g. class) joined for comparison.
                        _value = " ".join(tag.attrs[_attr])
                    if _value == value:
                        result = (ruler_pair[0], ExtraHtml.finder_need_tag(tag, need_tag))
                except:
                    pass
            else:
                try:
                    val = val[1:len(val) - 1]

                    ed = tag.attrs[_attr]
                    op = val

                    com = RulerExtra.canwecom(op, ed)

                    result = (ruler_pair[0], com)
                except:
                    return None
    # Direct lookup by tag name, no attributes involved.
    else:
        if tag.name == ruler_pair[1]:
            if tag.text is not None and tag.text != '':
                result = (ruler_pair[0], ExtraHtml.finder_need_tag(tag, need_tag))

    # Star post-processing of the matched value.
    if result != None and len(result) > 1 and result[0].count('*') > 0:
        key = ''
        value = ''
        value_list = []
        if ruler_pair[0].count('*') > 0:
            if len(tag.contents) > 1:  # multi-child handling
                for con in tag:
                    if isinstance(con, Tag):
                        # '***': collect <p> tags recursively
                        if ruler_pair[0].count('***') == 1:
                            if con.name == 'p':
                                ExtraHtml.ruler_finder_recursion_dig_for_p(con, value_list)
                        # '**': strip all tags
                        elif ruler_pair[0].count('**') == 1:
                            value += con.get_text()
                        # '*': keep tags
                        elif ruler_pair[0].count('*') == 1:
                            value += str(con)

            else:  # single-tag handling
                if ruler_pair[0].count('**') == 1:
                    value += tag.get_text()
                elif ruler_pair[0].count('*') == 1:
                    value += str(tag)

            # Finalize: normalize whitespace and strip the marker off the key.
            if ruler_pair[0].count('***') == 1:
                value = "".join(value_list)
                value = "".join(value.split())
                key = StringHelper.cutfrom(ruler_pair[0], '***')
            elif ruler_pair[0].count('**') == 1:
                value = "".join(value.split())
                key = StringHelper.cutfrom(ruler_pair[0], '**')
            elif ruler_pair[0].count('*') == 1:
                key = StringHelper.cutfrom(ruler_pair[0], '*')

        elif ruler_pair[0].count('^') > 0:
            value = tag.text
            value = "".join(value.split())
            key = StringHelper.cutfrom(ruler_pair[0], '^')
        else:
            key = ruler_pair[0]
            value = tag.text

        if value == '':
            print('info: in ruler_finder_ex star filter -> ' + result[0] + ' got empty result!')
            value = result[1]

        return (key, value)
    elif result == None:
        return None
    else:
        return result