def extract_ctext_org(html_file, *, verbose: bool):
    """Extract (title, body_text) from a ctext.org HTML page.

    :param html_file: HTML input accepted by BeautifulSoup.
        # html_file may be BytesIO/StringIO? -- TODO confirm accepted types
    :param verbose: when truthy, progress is printed via print_err.
    :return: (title, txt) where txt joins all extracted text chunks with '\\n'.
    :raises CTextOrgConfirmError: when the page is a captcha-confirm form
        instead of a content page (no og:title meta tag present).
    """
    # html_file -> (title, txt)
    verbose = bool(verbose)
    soup = BeautifulSoup(html_file, 'lxml')
    title_soup = soup.find('meta', property="og:title")
    if title_soup is None:
        # No og:title => this is the confirm page: locate its POST form and
        # the single text input, so the caller can submit a captcha answer.
        [form_soup] = soup.find_all('form', method='post')
        [text_field_soup] = form_soup.find_all('input', type='text')
        text_field_name = str(text_field_soup['name'])
        action_url = form_soup['action']
        raise CTextOrgConfirmError(action_url=action_url,
                                   text_field_name=text_field_name)
    title = title_soup['content']
    if verbose: print_err(f'title: {title!r}')
    #ver1:
    #ctexts = soup.find_all(attrs={'class': 'ctext'})
    #ls = [ctext.get_text().strip()
    #      for ctext in ctexts
    #      if not has_html_class(ctext, 'opt')
    #      and assert_html_class_single(ctext)
    #      ]
    #ver2: +<h2>
    # ver2 extraction strategy: module-level pred_ver2/get_text_ver2 select
    # and render the text-bearing tags (also picks up <h2> headers).
    pred = pred_ver2
    get_text = get_text_ver2
    ctexts = soup.find_all(pred)
    ls = [get_text(ctext) for ctext in ctexts]
    txt = '\n'.join(ls)
    if verbose: print_err('extract_ctext_org done!')
    return title, txt
def get(name):
    """Return the content of the og:<name> meta tag, as str.

    Closure helper: reads `soup`, `verbose` (and `print_err`) from the
    enclosing scope -- it is not usable standalone.

    :raises ValueError: if the og:<name> meta tag is absent.
    """
    # NOTE: local name `property` shadows the builtin; kept as-is here.
    property = f"og:{name}"
    meta_soup = soup.find(name='meta', property=property)
    if meta_soup is None:
        raise ValueError(f'not found: {property!r}')
    value = str(meta_soup['content'])
    if verbose: print_err(f'{name}: {value!r}')
    return value
def _parse_非对应简繁汉字(非对应简繁汉字):
    """Parse the "non-corresponding simplified/traditional characters" data text
    into a set of (simplified, traditional) character pairs.

    The input is a text whose data lines use two heads:
      "!!@:" -- one traditional char mapping to many simplified chars (12-1 entries);
      "!!:"  -- one simplified char mapping to many traditional chars (280+1 entries,
                after the 仿佛 line is expanded into two lines below).
    Parenthesized characters are rare variants; `_remove_p` strips the parens.
    '※' (substituted for 'X') marks a placeholder "unknown" character.
    """
    # one-traditional -> many-simplified (一繁多简): 12-1 entries, e.g.:
    # !!@:餘/余(馀)
    # !!@:摺(折)/折(摺)
    # ?? !!@:鑪/炉
    # one-simplified -> many-traditional (一简多繁): 280=43+64+46+43+84, e.g.:
    # !!:庵/庵菴(厂广)
    # !!:仿佛/仿佛、彷彿、髣髴
    # !!:干(乾)/干乾幹榦
    # !!:夹(挟)/夾(夹)挾
    # !!:菱/菱X
    非对应简繁汉字 = 非对应简繁汉字.replace('X', '※')
    # Expand the multi-char 仿佛 entry into two single-char entries
    # (the data format requires exactly one char on the left side).
    非对应简繁汉字 = 非对应简繁汉字.replace('!!:仿佛/仿佛、彷彿、髣髴', r'''
!!:仿/仿彷髣
!!:佛/佛彿髴
#''')
    #280 +1
    txt = 非对应简繁汉字
    s = set()
    # s :: {(简,繁)} -- accumulated (simplified, traditional) pairs
    head = "!!@:"
    pattern = r"^(?P<一繁>[\w()]+)/(?P<多简>[\w()※]{2,})$"
    it = _iter_line_tails(head, txt)
    rex = re.compile(pattern)
    pairs = []
    for tail in it:
        m = rex.fullmatch(tail)
        assert m is not None
        p = m["一繁"], m["多简"]
        pairs.append(p)
    assert len(pairs) == 12 - 1
    for 一繁, 多简 in pairs:
        for 繁 in _remove_p(一繁):
            for 简 in _remove_p(多简):
                s.add((简, 繁))
    #=========
    head = "!!:"
    pattern = r"^(?P<一简>[\w()]+)/(?P<多繁>[\w()※]{2,})$"
    it = _iter_line_tails(head, txt)
    rex = re.compile(pattern)
    pairs = []
    for tail in it:
        m = rex.fullmatch(tail)
        assert m is not None
        p = m["一简"], m["多繁"]
        pairs.append(p)
    assert len(pairs) == 280 + 1
    for 一简, 多繁 in pairs:
        for 简 in _remove_p(一简):
            for 繁 in _remove_p(多繁):
                if 繁 == "※":
                    # placeholder char: report but still record the pair
                    print_err(f"{简}->{繁}")
                s.add((简, 繁))
    return s
def check_echo(type_, obj):
    """Return *obj* unchanged; print a diagnostic if its type is not exactly *type_*.

    Best-effort type check: never raises -- a mismatch is only reported to
    stderr via print_err, and *obj* is echoed back either way.

    :param type_: the expected exact type (checked with `type(obj) is type_`,
        so subclasses are reported as mismatches).
    :param obj: any value; always returned as-is.
    """
    if type(obj) is not type_:
        from seed.tiny import print_err
        # bug fix: previously printed the builtin `type` instead of the
        # expected type `type_`, making the diagnostic useless.
        print_err(type_, repr(obj))
    return obj
def bare_extract_ctext_org__url(self, url, *, verbose: bool, timeout, referrer,
                                subcontents: bool, **kwargs):
    """Fetch *url* and extract it, without caching.

    :return: (title, txt) when not subcontents;
        ((title, url), subtitle_url_pairs) when subcontents.
    :raises CTextOrgConfirmError: re-raised from extraction (captcha page).
    :raises Exception: any other failure is wrapped as Exception(f'url=...', e).
        NOTE(review): wrapping without `from e` drops implicit chaining info.
    """
    # url -> (title, txt)
    verbose = bool(verbose)
    try:
        if verbose: print_err(f'fetch webpage: {url!r}')
        page_bytes = self.fetch_webpage(url, timeout=timeout,
                                        referrer=referrer, **kwargs)
        if verbose: print_err(f'extracting webpage...: {url!r}')
        if not subcontents:
            title, txt = self.extract_ctext_org__text(page_bytes,
                                                      verbose=verbose)
            result = title, txt
        else:
            # subcontents mode: the page is a table-of-contents; `url` is
            # rebound to the canonical url reported by the extractor.
            ((title, url), subtitle_url_pairs
             ) = self.extract_ctext_org__subcontents(page_bytes,
                                                     verbose=verbose)
            subtitle_url_pairs = tuple(subtitle_url_pairs)
            result = (title, url), subtitle_url_pairs
        if verbose: print_err(f'extract webpage done: {url!r}')
        return result
    except (CTextOrgConfirmError, *TimeoutErrors):
        # NOTE(review): this message also fires for CTextOrgConfirmError,
        # which is not a timeout -- confirm the wording is intentional.
        if verbose: print_err(f'extract webpage timeout: {url!r}')
        raise
    except Exception as e:
        if verbose: print_err(f'extract webpage error: {url!r}')
        raise Exception(f'url={url!r}', e)
def url2content(url, **kwargs):
    """Download *url* and decode its body to str.

    Uses the charset from the response headers when present, else utf8.
    Everything after the first `return content` is dead code kept from
    earlier experiments (get_payload/get_content attempts).
    """
    (content_type, content_data, url_info
     ) = download_file_from_url_ex(url, **kwargs)
    may_encoding = url_info.get_charset()
    #if may_encoding is None: raise Exception
    if may_encoding is None:
        # no charset header -> assume utf8
        encoding = 'utf8'
    else:
        encoding = may_encoding
    if type(encoding) is not str: raise Exception
    content = content_data.decode(encoding=encoding)
    return content
    # ---------------- dead code below (never executed) ----------------
    with open_url(url, **kwargs) as response:
        url_info = response.info()
        content = url_info.get_payload(decode=True)
        #get_payload(): This is a legacy method. On the EmailMessage class its
        #functionality is replaced by get_content() and iter_parts().
        if url_info.is_multipart():
            assert type(content) is list
            content = ''.join(content)
        try:
            assert type(content) is str
        except:
            print_err(type(content))  #bytes???
            print_err((content))
            raise
        return content
    print(dir(url_info))
    ##url_info :: HTTPMessage <: email.message.Message
    # (pasted dir(url_info) dump elided: HTTPMessage exposes the legacy
    #  Message API -- get_payload/get_charset/is_multipart/... -- but NOT
    #  get_content, which exists only on EmailMessage; see AttributeError
    #  note below.)
    content = url_info.get_content()
    #AttributeError: 'HTTPMessage' object has no attribute 'get_content'
    return content
def show_session(session):
    """Debug helper: dump a session's declared attrs to stderr.

    Currently disabled -- the bare `return` below short-circuits the body,
    leaving the dump code as intentionally dead debug scaffolding.
    """
    return
    attrs = type(session).__attrs__
    print_err('show_session')
    for attr in attrs:
        value = getattr(session, attr, None)
        print_err(f'\t{attr}={value!r}')
def extract_ctext_org__text(self, html_file, *, verbose: bool):
    """Parse a ctext.org page into (title, txt) via the two step-methods."""
    soup = BeautifulSoup(html_file, 'lxml')
    verbose = bool(verbose)
    # Step 1 pulls the og:title (raising CTextOrgConfirmError on captcha
    # pages); step 2 renders the body text.
    extracted_title = self.extract_ctext_org__step1_title(soup, verbose=verbose)
    extracted_txt = self.extract_ctext_org__step2_ctext(soup, verbose=verbose)
    if verbose:
        print_err('extract_ctext_org__text done!')
    return extracted_title, extracted_txt
def handle_exc(self, e):
    """Report exception *e* and let the user decide: any input continues
    (printing the traceback and returning None), empty input re-raises.

    Typical caller context: a retry loop around network calls,
    e.g. requests.exceptions.ConnectionError.
    """
    print_err(repr(type(e)))
    print_err(repr(e))
    answer = input('(input nothing to quit) >>>')
    if not answer:
        # empty input: re-raise the active exception to abort
        raise
    traceback.print_exc()
    return None
def handle_captcha_confirm(self, *, from_url, action_url, text_field_name,
                           verbose, timeout, **kwargs):
    """Interactive captcha loop for ctext.org's confirm page.

    Repeatedly asks the user for a captcha answer, POSTs it to *action_url*,
    and records whether the answer was accepted (save_captcha with
    correct=True/False, judged by whether re-extraction still raises
    CTextOrgConfirmError). Loops until one round completes without an
    unhandled exception; KeyboardInterrupt always propagates.
    """
    verbose = bool(verbose)
    if verbose:
        print_err(
            f'text_field_name={text_field_name!r}; action_url={action_url!r}; from_url={from_url!r}'
        )
    while True:
        try:
            maybe_captcha_ex = self.ask_maybe_captcha_ex(
                title='ctext.org requires confirm',
                referrer=from_url,
                timeout=max(10, timeout),  # give the user at least 10s
                **kwargs)
            if not maybe_captcha_ex:
                # user cancelled the dialog -> treat as interrupt
                raise KeyboardInterrupt
            [image_bytes, captcha] = maybe_captcha_ex
            if not captcha:
                # NOTE(review): `logic - error` is an undefined-name
                # subtraction -- a deliberate(?) NameError crash marker for
                # "should be impossible"; confirm and replace with a real
                # exception type.
                raise logic - error
            if verbose: print_err(f'input captcha = {captcha!r}')
            data = self.post_captcha(action_url=action_url,
                                     text_field_name=text_field_name,
                                     referrer=from_url,
                                     captcha=captcha,
                                     timeout=timeout,
                                     **kwargs)
            try:
                # Probe: if the response still parses as a confirm page,
                # the captcha was wrong.
                self.extract_ctext_org__text(data, verbose=False)
            except CTextOrgConfirmError:
                # input wrong captcha
                self.save_captcha(image_bytes=image_bytes, captcha=captcha,
                                  correct=False)
            except Exception as e:
                # unknown error
                self.handle_exc(e)
            else:
                # input correct captcha
                self.save_captcha(image_bytes=image_bytes, captcha=captcha,
                                  correct=True)
        except KeyboardInterrupt:
            raise
        except Exception as e:
            # handle_exc re-raises on empty user input; otherwise retry
            self.handle_exc(e)
            continue
        else:
            # NOTE(review): reached after both correct AND wrong captcha
            # (the wrong-captcha branch swallows its exception) -- confirm
            # that exiting after a wrong answer is intended.
            break
    #end while
    return
def stable_repr_data(self, *, iter_eol_indents, has_head_eol_when_indent: bool):
    """Join the pieces from stable_iter_repr_data into one string.

    On failure, the pieces are re-generated and dumped via print_err for
    debugging, then the original exception is re-raised.
    """
    def make_pieces():
        # fresh iterator each call (generators are single-use)
        return self.stable_iter_repr_data(
            iter_eol_indents=iter_eol_indents,
            has_head_eol_when_indent=has_head_eol_when_indent)

    try:
        return ''.join(make_pieces())
    except Exception:
        from seed.tiny import print_err
        print_err(list(make_pieces()))
        raise
def iter_ifiles():
    """Yield open text-file handles for every globbed input path.

    Closure helper: reads `glob_patterns`, `recursive`,
    `skipped_buggy_fname_set`, and `encoding` from the enclosing scope.
    Each yielded file is closed when the consumer advances the generator
    (the `with` re-enters on resume).
    """
    for glob_pattern in glob_patterns:
        for path in glob.iglob(glob_pattern, recursive=recursive):
            print_err(f'glob_pattern: {glob_pattern!r} ==>> path:{path!r}')
            fname = os.path.basename(path)
            if fname.lower() in skipped_buggy_fname_set:
                # known-bad file: report and skip
                print_err(f'\tskip:{fname!r}')
                continue
            else:
                with open(path, 'rt', encoding=encoding) as ifile:
                    yield ifile
def unordered_iter_extract_ctext_org__referrer_url_pairs(
        self, referrer_url_pairs, *, verbose: bool, timeout, time_sep,
        **kwargs):
    """Fetch & extract a batch of pages in random order, yielding results.

    Picks a random remaining (referrer, url) pair each round; sleeps a random
    time_sep..2*time_sep seconds before uncached fetches (rate limiting);
    handles captcha confirms and transient errors by retrying the same pair.

    # referrer_url_pairs :: [(referrer, url)]
    # -> Iter (referrer, url, title, txt)
    # NOTE(review): the comment above says (referrer, url, ...) but the code
    # yields (url, referrer, title, txt) -- confirm which order consumers use.
    """
    verbose = bool(verbose)
    referrer_url_pairs = list(referrer_url_pairs)
    while referrer_url_pairs:
        i = random.randrange(len(referrer_url_pairs))
        referrer, url = referrer_url_pairs[i]
        if verbose:
            print_err(
                f'to fetch&extract webpage {url!r} from: {referrer!r}')
        if url not in self.cache:
            # only rate-limit real network fetches, not cache hits
            t = random.randrange(time_sep, 2 * time_sep)
            if verbose:
                print_err(f'sleep {t}s before fetch&extract webpage')
            time.sleep(t)
        try:
            title, txt = self.cached_extract_ctext_org__url(
                url,
                referrer=referrer,
                verbose=verbose,
                timeout=timeout,
                subcontents=False,
                **kwargs)
        except CTextOrgConfirmError as e:
            #input('ctext.org requires confirm')
            self.handle_captcha_confirm(from_url=url,
                                        action_url=e.action_url,
                                        text_field_name=e.text_field_name,
                                        verbose=verbose,
                                        timeout=timeout,
                                        **kwargs)
            continue  # retry the same pair after confirming
        except KeyboardInterrupt:
            raise
        except (Exception, OSError, *TimeoutErrors) as e:
            self.handle_exc(e)
            continue  # retry the same pair
        yield url, referrer, title, txt
        # success: remove the pair (O(1) swap-remove; order is random anyway)
        L = len(referrer_url_pairs)
        swap_to_last_and_pop(referrer_url_pairs, i)
        assert len(referrer_url_pairs) == L - 1
def unordered_iter_extract_ctext_org__url_rng(self, base_url, indices,
                                              index_format, *, verbose: bool,
                                              timeout, time_sep, **kwargs):
    """Expand an index range into (referrer, url) pairs, then delegate to
    the unordered pair iterator.

    # url -> begin -> end -> Iter (title, txt)
    """
    verbose = bool(verbose)
    if verbose:
        print_err(f'fetch&extract webpages from: {base_url!r}')
    pairs = self.make_referrer_url_pairs__url_rng(
        base_url, indices, index_format)
    return self.unordered_iter_extract_ctext_org__referrer_url_pairs(
        pairs,
        verbose=verbose,
        timeout=timeout,
        time_sep=time_sep,
        **kwargs)
def _parse_非对称繁简字(非对称繁简字):
    """Parse the "asymmetric traditional/simplified characters" data text
    into a set of (simplified, traditional) character pairs.

    Data lines look like `<num>、<simplified>(<flag><traditional...>)` where
    the flag is '!' (simplified char itself excluded) or '~' (simplified char
    also counts as one of its own traditional forms). Four heads partition
    the data with expected entry counts asserted below.
    """
    # ¥ -- replaced by a common traditional character (¥—替换为 常见繁体字)
    # 95 entries, e.g.:
    # !!!:11、当(!當噹)
    # !!!:16、恶(~噁)
    # !!!:21、干(~幹乾)
    # 30 entries, e.g.:
    # !!!@:(110)、罗(!羅囉)
    # 39 entries, e.g.:
    # !!!&:1、呆(¥獃)
    # !!!&:15、泪(~淚)#
    # 28 entries, e.g.:
    # !!!$:16、凄(¥淒悽)
    # !!!$:26、伫(~佇竚)#?
    # 21?25?24
    # !!!$:2、雕(!鵰凋彫)
    # NOTE(review): the next line is a bare no-op expression statement --
    # presumably leftover debris; confirm it can be deleted.
    非对称繁简字
    txt = 非对称繁简字
    pattern = r"^[^\s、]+、(?P<一简>\w)((?P<多繁>[!~]\w+))(?P<problem>(?:[#][?])?)$"
    head_count_pairs = [("!!!:", 95), ("!!!@:", 30), ("!!!&:", 39),
                        ("!!!$:", 28 + 24)]
    rex = re.compile(pattern)
    all_pairs = []
    for head, count in head_count_pairs:
        it = _iter_line_tails(head, txt)
        pairs = []
        for tail in it:
            m = rex.fullmatch(tail)
            assert m is not None
            if m["problem"]:
                # trailing "#?" marks a questionable entry: report it
                print_err(tail)
            p = m["一简"], m["多繁"]
            pairs.append(p)
        assert len(pairs) == count
        all_pairs += pairs
    pairs = all_pairs
    s = set()
    # s :: {(简,繁)} -- accumulated (simplified, traditional) pairs
    for 简, 多繁 in pairs:
        # '~' flag: the simplified char is also one of its traditional forms
        z = 简 if 多繁[0] == '~' else ''
        多繁 = z + 多繁[1:]
        assert len(多繁) >= 2
        for 繁 in 多繁:
            s.add((简, 繁))
    return s
def extract_ctext_org__step1_title(self, soup, *, verbose: bool):
    """Step 1: extract the og:title from a parsed ctext.org page.

    Same head logic as the module-level extract_ctext_org (kept in sync).

    :param soup: a BeautifulSoup document.
    :return: the title string.
    :raises CTextOrgConfirmError: when no og:title exists -- the page is
        the captcha-confirm form; its POST action and text-field name are
        attached to the exception for the caller.
    """
    # html_file -> title
    # html_file may be BytesIO/StringIO?
    verbose = bool(verbose)
    title_soup = soup.find('meta', property="og:title")
    if title_soup is None:
        [form_soup] = soup.find_all('form', method='post')
        [text_field_soup] = form_soup.find_all('input', type='text')
        text_field_name = str(text_field_soup['name'])
        action_url = form_soup['action']
        raise CTextOrgConfirmError(action_url=action_url,
                                   text_field_name=text_field_name)
    title = title_soup['content']
    if verbose: print_err(f'title: {title!r}')
    return title
def is_relax_biconnected_ugraph_fake_embedding_relax_planar_ex(*
        ,relax_biconnected_ugraph_fake_embedding
        ,hedge2fake_counterclockwise_fface
        ):
    '''is relax_biconnected-ugraph_fake_embedding relax_planar?

    input:
        relax_biconnected_ugraph_fake_embedding
            # allow multiedge
            no self_loops
            vertex degree >= 2
            for each connected component cc:
                * cc is an isolated vertex
                OR:
                * cc is biconnected
                    cc.num_vertices >= 2
                    cc.num_aedges >= 2
    output:
        (0, [[fface]]) | (1, (fface, fvertex))
        | (2, (simple_cycle_hedges1, simple_path_hedges1))

        (0, [[fface]])
            relax_planar
            ffaces per connected component except isolated vertex
            ffaces is nonempty
            when merge ffaces in-order, the middle temp graphs are
            biconnected too
                i.e. avoid merge
            fface touch frontier_hedges exactly once
                touch(frontier_hedges, fface)
                    = [(start_vertex, max_common_path_hedges0)]
        (1, (fface, fvertex))
            non_relax_planar
            fface which visited a fvertex twices
        (2, (simple_cycle_hedges1, simple_path_hedges1))
            non_relax_planar
            simple_cycle_hedges1 and simple_path_hedges1 are simple&nonempty
            simple_path_hedges1 may be a cycle
            when treat them as clockwise cycle/path:
                simple_path_hedges1 begin inside&on simple_cycle_hedges1
                simple_path_hedges1 end outside&on simple_cycle_hedges1
    '''
    assert isinstance(relax_biconnected_ugraph_fake_embedding,
                      UGraphFakeEmbedding)
    return _calc(relax_biconnected_ugraph_fake_embedding
        ,hedge2fake_counterclockwise_fface).calc_main()
    # ---- dead debug code below (never executed; kept deliberately?) ----
    print_err(f'relax_biconnected_ugraph_fake_embedding={relax_biconnected_ugraph_fake_embedding}')
    print_err(f'hedge2fake_counterclockwise_fface={hedge2fake_counterclockwise_fface}')
def extract_ctext_org__url(url, *, verbose: bool, timeout, **kwargs):
    """Fetch *url* and extract (title, txt) via extract_ctext_org.

    :raises CTextOrgConfirmError: re-raised (captcha confirm page).
    :raises Exception: other failures wrapped as Exception(f'url=...', e).
        NOTE(review): no `from e`, so implicit exception chaining is lost.
    """
    # url -> (title, txt)
    verbose = bool(verbose)
    try:
        if verbose: print_err(f'fetch webpage: {url!r}')
        page_bytes = fetch_webpage(url, timeout=timeout, **kwargs)
        if verbose: print_err(f'extracting webpage...: {url!r}')
        title, txt = extract_ctext_org(page_bytes, verbose=verbose)
        if verbose: print_err(f'extract webpage done: {url!r}')
        return title, txt
    except (CTextOrgConfirmError, *TimeoutErrors):
        # NOTE(review): this "timeout" message also covers the confirm-page
        # exception -- confirm the wording is intentional.
        if verbose: print_err(f'extract webpage timeout: {url!r}')
        raise
    except Exception as e:
        if verbose: print_err(f'extract webpage error: {url!r}')
        raise Exception(f'url={url!r}', e)
def __handle_classfiles(
        existing_classfile_paths_via_jarfile, existing_iqnames
        #, sorted_excluding_iqname_prefixes
        #, excluding_iqname_prefix_trie
        #, excluding_iqname_prefixes_regex
        , to_exclude, *, verbose: bool):
    """Scan classfiles for depended iqnames and split the unknown ones.

    :param existing_classfile_paths_via_jarfile: classfile paths to analyze.
    :param existing_iqnames: iqnames already known/resolved.
    :param to_exclude: predicate iqname -> bool; True means exclude.
    :param verbose: when truthy, progress is printed via make_print_on.
    :return: (required_iqnames, excluded_iqnames) -- two disjoint sets of
        newly discovered iqnames not in existing_iqnames.
    """
    oprint = make_print_on(verbose)
    oprint(existing_classfile_paths_via_jarfile)
    try:
        rough_class_infos = classfile_xpaths2rough_class_infos(
            [], existing_classfile_paths_via_jarfile)
    except:
        # report the offending input before propagating
        print_err(
            f'existing_classfile_paths_via_jarfile={existing_classfile_paths_via_jarfile!r}'
        )
        raise
    known_iqnames = set(existing_iqnames)
    required_iqnames = set()
    excluded_iqnames = set()
    for (source_javafile_name, depended_iqnames) in rough_class_infos:
        oprint('\t', source_javafile_name)
        oprint('\t', depended_iqnames)
        for depended_iqname in depended_iqnames:
            if depended_iqname[0] in '["':
                # malformed name (array/string descriptor leaked through?)
                print_err(f'depended_iqname={depended_iqname!r}')
                raise Exception
            #if depended_iqname in existing_iqnames: continue
            #if depended_iqname in required_iqnames: continue
            #if depended_iqname in excluded_iqnames: continue
            # known_iqnames unions the three sets above -- one lookup
            if depended_iqname in known_iqnames: continue
            known_iqnames.add(depended_iqname)
            if to_exclude(depended_iqname):
                excluded_iqnames.add(depended_iqname)
            else:
                required_iqnames.add(depended_iqname)
    return required_iqnames, excluded_iqnames
def make_all_rooted_utree_attrs(self, *, maybe_either_root):
    """Compute all rooted-tree attributes and bundle them in a namespace.

    Each local variable below is intentionally named after an attribute in
    __class__.all_attr_set; the locals() capture at the end collects them.
    Do NOT rename locals or reorder the del -- the logic depends on exact
    local-variable names.

    # no "utree"??
    # maybe_either_root - see: aedge2maybe_upper_hedge()
    :return: ImmutableNamespace with exactly the attrs in all_attr_set.
    """
    aedge2maybe_upper_hedge = self.aedge2maybe_upper_hedge(
        maybe_either_root=maybe_either_root)
    vertex2maybe_parent_aedge = self.vertex2maybe_parent_aedge(
        aedge2maybe_upper_hedge=aedge2maybe_upper_hedge)
    either_root = self.either_root(
        aedge2maybe_upper_hedge=aedge2maybe_upper_hedge,
        vertex2maybe_parent_aedge=vertex2maybe_parent_aedge)
    vertex2child_aedges = self.vertex2child_aedges(
        aedge2maybe_upper_hedge=aedge2maybe_upper_hedge,
        vertex2maybe_parent_aedge=vertex2maybe_parent_aedge)
    vertex2maybe_parent_vertex = self.vertex2maybe_parent_vertex(
        aedge2maybe_upper_hedge=aedge2maybe_upper_hedge,
        vertex2maybe_parent_aedge=vertex2maybe_parent_aedge)
    vertex2depth = self.vertex2depth(
        vertex2maybe_parent_vertex=vertex2maybe_parent_vertex)
    depth2vertices1 = self.depth2vertices1(vertex2depth=vertex2depth)
    depth2depth_idx2vertex = self.depth2depth_idx2vertex(
        depth2vertices1=depth2vertices1)
    vertex2depth_idx = self.vertex2depth_idx(
        depth2vertices1=depth2vertices1)
    ################
    # Drop non-attribute locals, then snapshot the rest via locals().
    del self, maybe_either_root
    d = dict(locals())
    # __class__ in locals???  (implicit closure cell from using __class__)
    for name in (frozenset(d) - __class__.all_attr_set):
        del d[name]
    try:
        assert frozenset(d) == __class__.all_attr_set
    except:
        from seed.tiny import print_err
        print_err(__class__.all_attr_set - frozenset(d))
        print_err(frozenset(d) - __class__.all_attr_set)
        raise
    ns = ImmutableNamespace(**d)
    return ns
def __handle_complete_dot_idx(self, state): assert self.is_complete_state(state) #if state.dot_idx == 0: # direct nullable production_idx # return nonterminal_idx = self.production_idx2nonterminal_idx[ state.production_idx] begin = state.terminal_position_begin_of_production self.get_nonterminal_idx_begin_pair2complete_states().setdefault( (nonterminal_idx, begin), []).append(state) ### update first_prev_position should be before other put_state if begin == self.current_terminal_position: # direct/indirect null return prev_position = begin assert prev_position < self.current_terminal_position try: prev_states = self.terminal_position2nonterminal_idx2states[ prev_position][nonterminal_idx] except KeyError: if (prev_position == 0 and nonterminal_idx in self.start_nonterminal_idc): # start_nonterminal_idx donot have parent return print_err(f'prev_position = {prev_position}') print_err(f'nonterminal_idx = {nonterminal_idx}') print_err( f'terminal_position2nonterminal_idx2states[{prev_position}]={self.terminal_position2nonterminal_idx2states[prev_position]}' ) raise for prev_state in prev_states: self.put_forward_state(prev_state, prev_position) return
def maybe_parse_CHISE_IDS__line(line):
    """Parse one CHISE-IDS database line, or return None for blank/comment.

    # -> None|(char, may_char_ref, tree)
    # tree = (op, [arg]) | ('hz', char:char) | ('ref', ref_entity:str)
    # may_char_ref:None|str
    :raises Exception: on unknown line format, codepoint/char mismatch,
        or payload parse failure (wrapped with the offending line).
    """
    line = line.strip()
    if not line or line.startswith(';'):
        # blank line or ';' comment
        return None
    # apply known per-line corrections for upstream data bugs
    line = Globals.bugs.get(line, line)
    m = Globals.line_rex.fullmatch(line)
    if not m:
        raise Exception(f'unknown format: {line!r}')
    unicode = m['unicode']
    assert unicode[0] == 'U'
    assert unicode[1] in '+-'
    order = int(unicode[2:], base=16)  # hex codepoint after 'U+'/'U-'
    char_repr = m['char_repr']
    payload = m['payload']
    problem = m['problem']
    if len(char_repr) == 1:
        # literal character: must agree with the declared codepoint
        char = char_repr
        may_char_ref = None
        if ord(char) != order:
            raise Exception(
                f'bad format: unicode not match hz-char: {line!r}')
    else:
        # entity reference form: synthesize the char from the codepoint
        char = chr(order)
        char_ref = char_repr
        may_char_ref = char_ref
    try:
        tree = parse_CHISE_IDS__payload(payload)
    except Exception as e:
        raise Exception(e, line)
    if problem:
        # line flagged as problematic in the data: report it
        print_err(f'{line!r}')
    return (char, may_char_ref, tree)
def post_captcha(self, *, referrer, action_url, text_field_name, captcha,
                 timeout, **kwargs):
    """POST the captcha answer to *action_url* and return the response body.

    :return: response content bytes (the page served after confirmation).
    Everything after the first `return data` is dead experimental code.
    """
    data = {text_field_name: captcha}
    headers = self.make_headers(referrer=referrer)
    print_err(f'data={data}; headers={headers}')
    r = self.session.post(action_url,
                          headers=headers,
                          data=data,
                          allow_redirects=True,
                          timeout=timeout)
    show_session(self.session)
    data = r.content
    # read but discard
    return data
    # ---------------- dead code below (never executed) ----------------
    return
    print_err(data)
    if input('(input yes to continue)>>>') != 'yes':
        raise KeyboardInterrupt
    return
    ######### fail
    self.fetch_webpage(action_url,
                       referrer=referrer,
                       data=data,
                       timeout=timeout,
                       **kwargs)
def parse_tokens(self, __tokens):
    """Feed *__tokens* through a BasicEarleyParser and extract the parse tree.

    On a feed failure, dumps the parser's full state to stderr before
    re-raising (for grammar debugging).
    :return: the root node built by extract_parse_main_tree.
    """
    calc = self.cfg.calc
    parser = BasicEarleyParser(
        production_idx2nonterminal_idx=calc.production_idx2nonterminal_idx,
        production_idx2idxalternative=calc.production_idx2idxalternative,
        nonterminal_idx2sorted_production_idc=calc.
        nonterminal_idx2sorted_production_idc,
        nonterminal_idx2is_nullable=calc.nonterminal_idx2is_nullable,
        start_nonterminal_idc=self.start_nonterminal_idc,
        token2terminal_name=self.token2terminal_name,
        terminal_set_ops=calc.terminal_set_ops,
        terminal_set_idx2terminal_set=calc.terminal_set_idx2terminal_set,
        nonterminal_idx2nonterminal_name=calc.
        nonterminal_idx2nonterminal_name,
        nonterminal_idx2maybe_one_null_tree=calc.
        nonterminal_idx2maybe_one_null_tree)
    try:
        # drive the parser over all tokens (map is lazy; the loop drains it)
        for _ in map(parser.feed, __tokens):
            pass
    except:
        for attr, obj in vars(parser).items():
            print_err(attr)
            print_err(' ' * 4, '=', obj)
        #for attr in dir(parser): print_err(attr, getattr(parser, attr))
        raise
    if _show_rough_size_of_BasicEarleyParser:
        # module-level debug flag
        print_err('_show_rough_size_of_BasicEarleyParser:on')
        print_err(parser._get_rough_size())
    cfg = self.cfg
    node = parser.extract_parse_main_tree(
        ambiguous_nonterminal_idc=self.ambiguous_nonterminal_idc,
        make_leaf_of_at=functools.partial(make_leaf_of_at, cfg,
                                          self.token2terminal_name),
        make_nonnull_nonleaf_of_between=functools.partial(
            make_nonnull_nonleaf_of_between, cfg),
        make_null_nonleaf_of_at=functools.partial(make_null_nonleaf_of_at,
                                                  cfg))
    return node
def fecth_captcha_image__bytes(self, *, referrer, timeout, **kwargs): i = random.randrange(Global.captcha_url_random_range) #print_err(f"random i for captcha_url_fmt: {i}={i:x}") image_url = Global.captcha_url_fmt.format(i) content_data = self.download_file_from_url(image_url, referrer=referrer, timeout=timeout, **kwargs) if False: print_err(f'captcha image content_type:{content_type!r}') print_err(type(content_data)) print_err(content_data) #assert content_type.lower().startswith('image') return content_data
def fecth_image__PIL(url, **kwargs):
    """Download *url* and open it as a PIL image.

    NOTE: "fecth" typo is kept in the name -- renaming would break callers.
    :return: a PIL.Image instance backed by an in-memory BytesIO.
    :raises AssertionError: if the response content-type is not image/*.
    """
    (content_type, content_data,
     url_info) = download_file_from_url_ex(url, **kwargs)
    if False:
        # disabled debug dump
        print_err(f'captcha image content_type:{content_type!r}')
        print_err(type(content_data))
        print_err(content_data)
    assert content_type.lower().startswith('image')
    image_file = io.BytesIO(content_data)
    image_PIL = PIL.Image.open(image_file)
    #image_tk = PIL.ImageTk.PhotoImage(master=___donot_destroy_me, image=image_PIL)
    # RuntimeError('Too early to create image',)
    # see: ___donot_destroy_me
    return image_PIL
def cached_extract_ctext_org__url(self, url, *, verbose: bool, timeout,
                                  referrer, subcontents: bool, **kwargs):
    """Like bare_extract_ctext_org__url, but memoized in self.cache.

    Cache key is the url, prefixed with '[subcontents]' in subcontents mode
    so the two result shapes never collide.
    # url -> (title, txt)
    """
    verbose = bool(verbose)
    subcontents = bool(subcontents)
    fetch = lambda: self.bare_extract_ctext_org__url(url,
                                                     referrer=referrer,
                                                     verbose=verbose,
                                                     timeout=timeout,
                                                     subcontents=subcontents,
                                                     **kwargs)
    if not subcontents:
        key = url
    else:
        key = '[subcontents]' + url

    def result2title(result):
        # Pull the title out of either result shape (see bare_... return).
        if not subcontents:
            title, txt = result
        else:
            ((title, _url), subtitle_url_pairs) = result
        return title

    if verbose:
        # only needed for the verbose messages below
        str_may_subcontents = '[subcontents]' if subcontents else ''
    if key not in self.cache:
        result = fetch()
        self.cache[key] = result
        title = result2title(result)
        if verbose:
            print_err(f'store title{str_may_subcontents!s}: {title!r}')
    # always re-read from the cache (also covers the just-stored case)
    if verbose: print_err(f'read cached webpage: {url!r}')
    result = self.cache[key]
    title = result2title(result)
    if verbose: print_err(f'read title{str_may_subcontents!s}: {title!r}')
    return result
def _init_subclass4StructBase_(cls):
    """__init_subclass__ helper: precompute attribute metadata on *cls*.

    Skipped for abstract classes. Populates:
      __all_impl_attr_set__, __cached_attr2calc__,
      __all_primekey_attr_seq__, __all_user_attr_seq__.
    :raises TypeError: on duplicate impl attrs or duplicate cached attrs
        (duplicates are printed to stderr first for the impl case).
    """
    if inspect.isabstract(cls):
        return
    impl_attr_seq = tuple(cls.__iter_all_impl_attrs__())
    cls.__all_impl_attr_set__ = frozenset(impl_attr_seq)
    cached_attr_calc_pair_seq = tuple(
        cls.__iter_all_cached_attr_calc_pairs__())
    cls.__cached_attr2calc__ = MappingProxyType(
        dict(cached_attr_calc_pair_seq))
    try:
        # set smaller than seq => impl_attr_seq contains duplicates
        if len(cls.__all_impl_attr_set__) != len(impl_attr_seq):
            raise TypeError
    except:
        print_err(cls.__all_impl_attr_set__)
        print_err(impl_attr_seq)
        from seed.iters.duplicate_elements import find_duplicate_element_groups
        print_err(find_duplicate_element_groups(impl_attr_seq))
        raise
    if len(cls.__cached_attr2calc__) != len(cached_attr_calc_pair_seq):
        # duplicate cached-attr keys collapsed by dict()
        raise TypeError
    cls.__all_primekey_attr_seq__ = tuple(
        cls.__iter_all_primekey_attrs__())
    cls.__all_user_attr_seq__ = tuple(cls.__iter_all_user_attrs__())
def iter_extract_ctext_org__url_rng(base_url, indices, index_format, *,
                                    verbose: bool, timeout, time_sep,
                                    **kwargs):
    """Yield (title, txt) for each index-derived url, in order, with retry.

    For every index: sleep a random time_sep..2*time_sep seconds, fetch and
    extract; on a captcha confirm page, run an interactive captcha sub-loop;
    on other errors, retry the same url. KeyboardInterrupt always aborts.
    # url -> begin -> end -> Iter (title, txt)
    """
    #base_url = Path(base_url)
    verbose = bool(verbose)
    if verbose: print_err(f'fetch&extract webpages from: {base_url!r}')
    if base_url[-1:] == '/':
        # normalize: strip one trailing slash before joining
        base_url = base_url[:-1]
    referrer = base_url
    base_fmt = f'{base_url}/{index_format}'
    for i in indices:
        #str_i = str(i)
        #url = base_url / str_i; url = str(url)
        #url = os.path.join(base_url, str_i)
        #url = f'{base_url}/{i}'
        url = base_fmt.format(i)
        #print(url)
        while True:  # retry loop for this one url
            t = random.randrange(time_sep, 2 * time_sep)
            if verbose:
                print_err(
                    f'sleep {t}s before fetch&extract webpages from: {base_url!r}'
                )
            time.sleep(t)
            try:
                title, txt = extract_ctext_org__url(url,
                                                    verbose=verbose,
                                                    timeout=timeout,
                                                    referrer=referrer,
                                                    **kwargs)
            except CTextOrgConfirmError as e:
                #input('ctext.org requires confirm')
                action_url = e.action_url
                text_field_name = e.text_field_name
                while True:  # interactive captcha sub-loop
                    try:
                        captcha = ask_captcha(
                            title='ctext.org requires confirm',
                            referrer=url,
                            timeout=max(10, timeout))
                        post_captcha(action_url=action_url,
                                     text_field_name=text_field_name,
                                     referrer=url,
                                     captcha=captcha,
                                     timeout=timeout)
                    except KeyboardInterrupt:
                        raise
                    except Exception as e2:
                        print_err(repr(e2))
                        # any input => show traceback and retry captcha;
                        # empty input => give up on this url
                        if input('>>>'):
                            traceback.print_exc()
                            continue
                        else:
                            raise e2
                    else:
                        break
                continue  # captcha confirmed: retry the fetch
            except KeyboardInterrupt:
                raise
            except (Exception, OSError, *TimeoutErrors) as e:
                print_err(repr(e))
                continue  # transient error: retry the same url
            break  # success: leave the retry loop
        yield title, txt
def main(args=None):
    """CLI entry point: count identifiers in one file or a globbed tree.

    Reads input (stdin, one file, or files matched under a root by
    --glob_pattern), feeds every line into the module-level `feed`, then
    writes the `lst`/`show` summary to output (stdout or --output).
    Files with undecodable bytes are reported and skipped.
    """
    import argparse
    from seed.io.may_open import may_open_stdin, may_open_stdout
    parser = argparse.ArgumentParser(
        description='count identifiers',
        epilog='',
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument('-i', '--input', type=str, default=None,
                        help='input file path')
    parser.add_argument('-g', '--glob_pattern', type=str, default=None,
                        help='treat <input> as folder path')
    parser.add_argument('-o', '--output', type=str, default=None,
                        help='output file path')
    parser.add_argument('-e', '--encoding', type=str, default='utf8',
                        help='input/output file encoding')
    parser.add_argument('-f', '--force', action='store_true', default=False,
                        help='open mode for output file')
    args = parser.parse_args(args)
    encoding = args.encoding
    # --force overwrites; default 'xt' refuses to clobber existing output
    omode = 'wt' if args.force else 'xt'
    d = {}

    def f(fin):
        # accumulate counts from one open text file into d
        for line in fin:
            feed(d, line)

    may_glob_pattern = args.glob_pattern
    if may_glob_pattern is None:
        # single-input mode: --input path or stdin when omitted
        may_ifname = args.input
        with may_open_stdin(may_ifname, 'rt', encoding=encoding) as fin:
            f(fin)
    else:
        # glob mode: --input is the root folder (default '.')
        glob_pattern = may_glob_pattern
        may_root = args.input
        root = '.' if not may_root else may_root
        for path in iter_files(root, glob_pattern):
            try:
                with open(path, 'rt', encoding=encoding) as fin:
                    f(fin)
            except UnicodeDecodeError:
                # binary/mis-encoded file: report and skip
                print_err(path)
                continue
            except:
                print_err(path)
                raise
    ls = lst(d)
    may_ofname = args.output
    with may_open_stdout(may_ofname, omode, encoding=encoding) as fout:
        show(fout, ls)