Python StringHelper.cut_head Exemples

Langage de programmation: Python

Espace de noms/Paquet : CoTec.utility.string.string_go

Class/Type: StringHelper

Méthode/Fonction: cut_head

Exemples sur hotexamples.com : 2

Python StringHelper.cut_head - 2 exemples trouvés. Ce sont les exemples réels les mieux notés de CoTec.utility.string.string_go.StringHelper.cut_head extraits de projets open source. Vous pouvez noter les exemples pour nous aider à en améliorer la qualité.

Méthodes fréquemment utilisées

Afficher Cacher

extra_a_to_b_x(6)

str_to_dictup(4)

cutfrom(3)

cut_head(2)

cut_tail(2)

extra_a_to_b(2)

extra_equ_value(2)

str_to_tup(2)

delete_piece(1)

dic_looper(1)

exchange(1)

is_url(1)

separator(1)

strip_head_tail(1)

trim(1)

url_divider(1)

Méthodes fréquemment utilisées

extra_a_to_b_x (6)

str_to_dictup (4)

cutfrom (3)

cut_head (2)

cut_tail (2)

extra_a_to_b (2)

extra_equ_value (2)

str_to_tup (2)

delete_piece (1)

dic_looper (1)

Méthodes fréquemment utilisées

exchange (1)

is_url (1)

separator (1)

strip_head_tail (1)

trim (1)

url_divider (1)

Exemple #1

0

Afficher le fichier

def go_json(rawData, ruler, cap=None):
    """Extract the values selected by ``ruler`` from a single JSON document.

    :param rawData: raw text holding the JSON payload.
    :param ruler: rule specification; it is expanded by
        ``ExtraHtml.ruler_killer_ex`` into entries whose element ``0`` is the
        output key and element ``1`` the marker searched for in each item.
    :param cap: optional pair of delimiters. When given, ``rawData`` is first
        narrowed with ``StringHelper.extra_a_to_b_x(rawData, cap[0], cap[1])``.
    :return: dict mapping each matching rule key to the item text with the
        marker (plus one trailing space) cut from its head.
    """
    extracted = {}

    if cap is not None:
        rawData = StringHelper.extra_a_to_b_x(rawData, cap[0], cap[1])

    rulers = ExtraHtml.ruler_killer_ex(ruler)
    decoded = demjson.decode(rawData)

    # dic_to_list fills the list passed to it in place.
    flat_items = []
    ExtraJSON.dic_to_list(decoded, flat_items)

    for item in flat_items:
        # Every rule is tried against every item; a later item matching the
        # same rule overwrites the earlier value, as in the original code.
        for rule in rulers:
            if item.count(rule[1]) > 0:
                extracted[rule[0]] = StringHelper.cut_head(item, rule[1] + ' ')

    return extracted

Exemple #2

0

Afficher le fichier

Fichier : html_scraper.py Projet : Season02/Spezia2

def ruler_finder_ex(tag, ruler_pair, need_tag=False):
    """Core matcher used when scraping elements out of an HTML page.

    Tests whether ``tag`` satisfies the rule in ``ruler_pair`` and, when it
    does, returns the extracted value.

    NOTE(review): this function was rebuilt from a listing whose line breaks
    and indentation had been lost. The branch nesting follows the most
    plausible reading; confirm it against the project history, in particular
    the star post-processing chain at the end.

    :param tag: soup element currently being visited.
    :param ruler_pair: ``(key, rule)`` pair. ``rule`` selects the element;
        ``*``, ``**`` or ``***`` markers in ``key`` choose how the value is
        rendered (keep markup, text only, or text collected from ``<p>``
        children).
    :param need_tag: forwarded to ``ExtraHtml.finder_need_tag``; according to
        the original note, when true the matched child tag itself is returned.
    :return: ``None`` when the tag does not match; otherwise the match result,
        normally a ``(key, value)`` tuple.
    """
    result = None
    key_spec = ruler_pair[0]
    rule = ruler_pair[1]

    # Does the key ask for the next sibling of the matched tag?
    next_sibling = False
    if ExtraHtml.ruler_finder_condition_next_sibling(key_spec):
        next_sibling = True

    # Match on the text between the tags: for <span>hello</span> the rule
    # is written as [hello].
    if ExtraHtml.ruler_finder_condition_content(rule):
        try:
            if tag.get_text() == ExtraHtml.ruler_finder_condition_content(rule, True):
                if next_sibling:
                    return ExtraHtml.finder_need_tag(
                        tag, need_tag, next_sibling=True, ruler_pair=ruler_pair)
                return (key_spec, ExtraHtml.finder_need_tag(tag, need_tag))
        except Exception:
            # Best effort: a tag that cannot be inspected simply does not match.
            pass

    # Collect all children of the same type under a parent tag.
    # (The original author notes this feature looks abandoned.)
    elif ExtraHtml.ruler_finder_condition(rule) == 'at_loop':
        matches = ExtraHtml.ruler_finder_multi_at(tag, rule)
        if matches is not None:
            return (key_spec, matches)

    # Match on the parent: "<child>[ attrs]:<parent>[ attr[=value]]".
    # Colons that belong to "http:" do not count as separators.
    elif rule.count(':') > 0 and rule.count(':') > rule.count('http:'):
        child_spec, parent_spec = rule.split(':', 1)

        name = child_spec  # child tag name, possibly followed by attributes
        name_part = []
        if name.count(' ') > 0:
            name_part = name.split(' ', 1)
            name = name_part[0]
        if tag.name != name:
            return None

        parent_pair = parent_spec.split(' ', 1)
        parent_name = parent_pair[0]
        parent = tag.parent
        # Skip when there is no parent, it is not a Tag, or its name differs.
        if parent is None or not isinstance(parent, Tag) or parent.name != parent_name:
            return None

        if len(parent_pair) == 1:
            # No attribute requested: the parent name alone is enough.
            status = True
        else:
            parent_attr = parent_pair[1].split('=', 1)
            attr_name = parent_attr[0]
            expected = parent_attr[1] if len(parent_attr) > 1 else ''
            if attr_name not in parent.attrs:
                status = False
            elif expected == '':
                # "attr" or "attr=": the attribute only has to be present.
                status = True
            else:
                actual = parent.attrs[attr_name]
                # Multi-valued attributes (e.g. class) are lists; join only
                # those, a plain string must be compared as-is.
                if isinstance(actual, list):
                    actual = ' '.join(actual)
                status = actual == expected

        if status:
            if len(name_part) > 1 and name_part[1]:
                # The child part carries its own attribute rule: re-check the
                # tag against that part alone.
                return ExtraHtml.ruler_finder_ex(tag, (key_spec, child_spec), need_tag)
            result = (key_spec, ExtraHtml.finder_need_tag(tag, need_tag))

    # Match on the tag's own attributes.
    elif rule.count(' ') > 0 or rule.count('=') == 2:
        if rule.count('=') == 2:
            ruler_pair_part_of = rule.split(' ', 2)
            identify = ruler_pair_part_of[2].split('=', 1)
            if tag.name != ruler_pair_part_of[0]:
                return None
            try:
                if tag.attrs[identify[0]] == identify[1]:
                    return ExtraHtml.ruler_finder_ex(
                        tag,
                        (key_spec, ruler_pair_part_of[0] + ' ' + ruler_pair_part_of[1]),
                        need_tag)
            except Exception:
                # Missing attribute or malformed rule: treated as no match.
                pass
        else:
            _pair = rule.split(' ', 1)
            _name = _pair[0]
            next_flag = False
            if _name.count('^') == 1:
                next_flag = True
                _name = Sh.cut_head(_name, '^')
            if tag.name != _name:
                return None

            # Keyword lookup: "tag [keyword]".
            keyword = Sh.extra_a_to_b_x(_pair[1], '[', ']')
            if _pair[1].count('[') == 1 and _pair[1].count(']') == 1:
                if keyword in tag.text and len(tag.contents) <= 1:
                    if next_flag:
                        return (key_spec, ExtraHtml.finder_need_tag(tag, need_tag, True))
                    return (key_spec, ExtraHtml.finder_need_tag(tag, need_tag))

            equ_pair = _pair[1].split('=', 1)
            _attr = equ_pair[0]
            val = StringHelper.extra_a_to_b(_attr, '(', ')')
            _attr = StringHelper.delete_piece(_attr, val)
            # Drop empty pieces without mutating the list being iterated.
            equ_pair = [piece for piece in equ_pair if piece != '']

            if len(equ_pair) > 1:
                # "attr=value": the attribute must equal the value.
                value = equ_pair[1]
                try:
                    att = tag.attrs[_attr]
                    _value = ''
                    if isinstance(att, str):
                        _value = att
                    elif isinstance(att, list):
                        _value = " ".join(att)
                    if _value == value:
                        result = (key_spec, ExtraHtml.finder_need_tag(tag, need_tag))
                except Exception:
                    pass
            else:
                # "attr" only: return the attribute value, combined with the
                # text found between the parentheses via RulerExtra.canwecom.
                try:
                    op = val[1:len(val) - 1]
                    ed = tag.attrs[_attr]
                    result = (key_spec, RulerExtra.canwecom(op, ed))
                except Exception:
                    return None

    # Plain lookup by tag name, no attributes involved.
    else:
        if tag.name == rule:
            if tag.text is not None and tag.text != '':
                result = (key_spec, ExtraHtml.finder_need_tag(tag, need_tag))

    # Star post-processing: only applies when the result key carries a '*'.
    if result is not None and len(result) > 1 and result[0].count('*') > 0:
        key = ''
        value = ''
        value_list = []
        if key_spec.count('*') > 0:
            if len(tag.contents) > 1:
                # Several children: inspect each child tag.
                for con in tag:
                    if isinstance(con, Tag):
                        if key_spec.count('***') == 1:
                            # Collect text from <p> children only.
                            if con.name == 'p':
                                ExtraHtml.ruler_finder_recursion_dig_for_p(con, value_list)
                        elif key_spec.count('**') == 1:
                            # Strip all markup.
                            value += con.get_text()
                        elif key_spec.count('*') == 1:
                            # Keep the markup.
                            value += str(con)
            else:
                # Single child: work on the tag itself.
                if key_spec.count('**') == 1:
                    value += tag.get_text()
                elif key_spec.count('*') == 1:
                    value += str(tag)

            # Finish up: squeeze out whitespace and strip the marker from the key.
            if key_spec.count('***') == 1:
                value = "".join(value_list)
                value = "".join(value.split())
                key = StringHelper.cutfrom(key_spec, '***')
            elif key_spec.count('**') == 1:
                value = "".join(value.split())
                key = StringHelper.cutfrom(key_spec, '**')
            elif key_spec.count('*') == 1:
                key = StringHelper.cutfrom(key_spec, '*')
        elif key_spec.count('^') > 0:
            value = tag.text
            value = "".join(value.split())
            key = StringHelper.cutfrom(key_spec, '^')
        else:
            key = key_spec
            value = tag.text

        if value == '':
            print('info: in ruler_finder_ex star filter -> ' + result[0] + ' got empty result!')
            # Fall back to the value found before star processing.
            value = result[1]
        return (key, value)

    # No star processing: hand back the match, or None when nothing matched.
    return result