import os

# Helpers such as http_do, internet_except, get_charset_from_content_type,
# write_bin, format_HTMLParser, list_HTMLParser, get_data_attrs_from_tree_list
# and _max_tag_match_dynamic are assumed to be defined elsewhere in this
# module (or imported).

def get_html(url):
    r = http_do(url, 'GET')
    if r.reason != 'ok':
        raise internet_except('get_html fail: url = {}'.format(url))
    encoding = get_charset_from_content_type(r.head['content-type'])
    txt = r.data.decode(encoding)
    return txt
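
# The charset lookup above relies on a helper defined elsewhere; the sketch
# below is a hypothetical stand-in (not the project's actual code), assuming
# it pulls the 'charset=...' parameter out of a header such as
# 'text/html; charset=utf-8' and falls back to UTF-8 when none is present:
#
# def get_charset_from_content_type(content_type, default='utf-8'):
#     for part in content_type.split(';'):
#         key, _, value = part.strip().partition('=')
#         if key.lower() == 'charset' and value:
#             return value.strip('"\'')
#     return default
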
def format_html(html_txt):
    p = format_HTMLParser()
    try:
        p.feed(html_txt)
    except Exception:
        raise internet_except(p.txt)
    # p.txt holds fragments as a list of lists; flatten both levels and join.
    return ''.join(str(e) for strs in p.txt for e in strs)
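
# format_HTMLParser is defined elsewhere; the sketch below is a guess at its
# shape, not the project's actual class. It assumes a subclass of the
# standard html.parser.HTMLParser that accumulates fragments as a list of
# lists in self.txt, which would explain the two-level flattening above:
#
# from html.parser import HTMLParser
#
# class format_HTMLParser(HTMLParser):
#     def __init__(self):
#         super().__init__()
#         self.txt = []
#     def handle_starttag(self, tag, attrs):
#         self.txt.append(['<{}>'.format(tag)])
#     def handle_endtag(self, tag):
#         self.txt.append(['</{}>'.format(tag)])
#     def handle_data(self, data):
#         self.txt.append([data])
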
def download_image(fname, url):
    if os.path.exists(fname):
        raise FileExistsError('{fname} already exists'.format(fname=fname))
    r = http_do(url, 'GET', timeout=60 * 2)
    if r.reason != 'ok':
        raise internet_except('http_do GET fail: url = {}'.format(url))
    write_bin(fname, r.data)
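
# write_bin is assumed to be the module's trivial binary-write helper; a
# minimal stand-in would be:
#
# def write_bin(fname, data):
#     with open(fname, 'wb') as f:
#         f.write(data)
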
def get_the_exact_only_one_data_from_tree_list_node(node):
    #print(node)
    try:
        if node[-1]:
            # node[-1] is the children list: unwrap exactly one child,
            # one grandchild, and the single string inside it.
            data, = node[-1]
            data, = data
            data, = data
            assert isinstance(data, str)
        else:
            data = ''
    except Exception:
        raise internet_except(node)
    return data
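
# Example of the node shape the triple unpack above implies (inferred from
# the code, not documented in this file): node[-1] is the children list,
# which must hold exactly one child, holding one grandchild, wrapping a
# single string.
#
# node = ('span', [('class', 'title')], [[['hello']]])
# get_the_exact_only_one_data_from_tree_list_node(node)   # -> 'hello'
# empty = ('br', [], [])
# get_the_exact_only_one_data_from_tree_list_node(empty)  # -> ''
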
def _get_info_from_html_txt(html_txt, match_to_list_root, match_to_info,
                            get_data_from_tree_list_node):
    p = list_HTMLParser(match_to_list_root)
    p.feed(html_txt)
    page_info = get_data_attrs_from_tree_list(p.tree, match_to_info,
                                              get_data_from_tree_list_node)
    for slot_idx, one_matched_info_slot in enumerate(page_info):
        for pattern_idx, (matched_info_ls, info_pattern) in \
                enumerate(zip(one_matched_info_slot, match_to_info)):
            try:
                for t2_or_t3 in matched_info_ls:
                    # t2_or_t3 is (tag, attrs) or (tag, attrs, data)
                    yield t2_or_t3, (slot_idx, pattern_idx, info_pattern)
            except StopIteration:
                raise
            except GeneratorExit:
                raise
            except Exception:
                raise internet_except(
                    'error: "for t2_or_t3 in matched_info_ls:"',
                    one_matched_info_slot=one_matched_info_slot,
                    matched_info_ls=matched_info_ls)
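
# Hedged usage sketch. The pattern arguments below are assumptions about the
# matcher's pattern format (they are not defined in this file); the point is
# the shape of what the generator yields: each item is a (tag, attrs) or
# (tag, attrs, data) tuple paired with its (slot_idx, pattern_idx,
# info_pattern) origin.
#
# html_txt = get_html('http://example.com/list')
# for item, (slot_idx, pattern_idx, info_pattern) in _get_info_from_html_txt(
#         html_txt,
#         match_to_list_root=('ul', {'class': 'items'}),  # assumed shape
#         match_to_info=[('a', {})],                      # assumed shape
#         get_data_from_tree_list_node=
#             get_the_exact_only_one_data_from_tree_list_node):
#     data = item[2] if len(item) == 3 else None
#     print(slot_idx, item[0], item[1], data)
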
def _max_tag_match_split_under_some_assumption(max_int, non_zero_int_list):
    n = max_int
    ils = non_zero_int_list
    m = len(ils)
    for i in non_zero_int_list:
        assert -n <= i <= n and i != 0
    ls = [0] * (n + 1)
    flags = [True] * (n + 1)

    # Set flags: mark a tag id False as soon as its opens/closes cannot
    # balance, so only consistently nested ids stay flagged.
    for tag in ils:
        i = abs(tag)
        if not flags[i]:
            continue
        count = ls[i]
        if tag < 0:  # close
            if count == 0:
                flags[i] = False
            else:
                ls[i] -= 1
        else:  # open
            ls[i] += 1
    for i, count in enumerate(ls):
        if count:
            flags[i] = False
    # Consistency of the still-flagged tag ids is assumed from here on
    # (for simplicity); a violation raises 'assumption fail' below.

    # Build split_tags_ls and partially init match_map.
    stack = []
    split_tags_ls = []
    split_idc_ls = []
    split_tags_stack = [([], [])]
    match_map = [None] * m

    def split_tags_pop_to():
        split_tags, split_idc = split_tags_stack.pop()
        if split_tags:
            split_tags_ls.append(split_tags)
            split_idc_ls.append(split_idc)

    for idx, tag in enumerate(ils):
        i = abs(tag)
        if not flags[i]:
            # Unflagged tag: defer it to the per-segment matcher below.
            split_tags, split_idc = split_tags_stack[-1]
            split_tags.append(tag)
            split_idc.append(idx)
            continue
        if tag < 0:  # close
            if not stack or stack[-1][1] != i:
                raise internet_except('assumption fail',
                                      'tag inconsistency', ils)
            last = idx
            first, _ = stack.pop()
            assert match_map[first] is None and match_map[last] is None
            match_map[first] = last
            match_map[last] = first
            split_tags_pop_to()
        else:  # open
            stack.append((idx, tag))
            split_tags_stack.append(([], []))
    split_tags_pop_to()
    assert not stack
    assert not split_tags_stack

    #utag2idx_buffer = flags
    for tags, idc in zip(split_tags_ls, split_idc_ls):
        #utags = (abs(tag) for tag in tags)
        #idx2utag = unify_with_integer_buffer(utags, utag2idx_buffer)
        #tags = tuple((1+utag2idx_buffer[abs(tag)])*sign(tag) for tag in tags)##### +1
        ex_match_map = _max_tag_match_dynamic(tags)
        for i, match in enumerate(ex_match_map):
            assert match_map[idc[i]] is None
            if match is not None:
                match_map[idc[i]] = idc[match]
    return match_map
    # The original source continued with an unreachable normalization pass
    # (dead code after the return above); kept here for reference:
    # for i in range(m):
    #     if match_map[i] is None:
    #         match_map[i] = -1
    # return match_map
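
# For intuition, a self-contained reference matcher for the simple
# well-nested case (this is NOT _max_tag_match_dynamic, which handles the
# leftover segments): positive ints are opening tags, negative ints the
# matching closing tags, and match_map[i] gives the partner index or None.
#
# def match_tags_with_stack(tags):
#     match_map = [None] * len(tags)
#     stack = []
#     for idx, tag in enumerate(tags):
#         if tag > 0:                              # open: remember position
#             stack.append(idx)
#         elif stack and tags[stack[-1]] == -tag:  # close: pair with the top
#             first = stack.pop()
#             match_map[first] = idx
#             match_map[idx] = first
#     return match_map
#
# match_tags_with_stack([1, 2, -2, -1, 3])  # -> [3, 2, 1, 0, None]
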