def extract_ctext_org(html_file, *, verbose: bool):
    # html_file -> (title, txt)
    # html_file may be BytesIO/StringIO?
    verbose = bool(verbose)

    soup = BeautifulSoup(html_file, 'lxml')
    title_soup = soup.find('meta', property="og:title")
    if title_soup is None:
        [form_soup] = soup.find_all('form', method='post')
        [text_field_soup] = form_soup.find_all('input', type='text')
        text_field_name = str(text_field_soup['name'])
        action_url = form_soup['action']
        raise CTextOrgConfirmError(action_url=action_url,
                                   text_field_name=text_field_name)

    title = title_soup['content']
    if verbose: print_err(f'title: {title!r}')

    #ver1: ctexts = soup.find_all(attrs={'class': 'ctext'})
    #ls = [ctext.get_text().strip()
    #       for ctext in ctexts
    #       if not has_html_class(ctext, 'opt')
    #           and assert_html_class_single(ctext)
    #   ]
    #ver2: +<h2>
    pred = pred_ver2
    get_text = get_text_ver2
    ctexts = soup.find_all(pred)
    ls = [get_text(ctext) for ctext in ctexts]
    txt = '\n'.join(ls)

    if verbose: print_err('extract_ctext_org done!')
    return title, txt
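
Note: pred_ver2 and get_text_ver2 are not shown in this snippet. Going by the "#ver2: +<h2>" comment, a minimal sketch might look like the following; the exact class filtering is an assumption, not the original code:

def pred_ver2(tag):
    # Assumed: keep <h2> headings plus non-'opt' elements carrying class 'ctext'.
    if tag.name == 'h2':
        return True
    classes = tag.get('class') or []
    return 'ctext' in classes and 'opt' not in classes

def get_text_ver2(tag):
    # Assumed: plain stripped text, as in the commented-out ver1 above.
    return tag.get_text().strip()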
 def get(name):
     property = f"og:{name}"
     meta_soup = soup.find(name='meta', property=property)
     if meta_soup is None: raise ValueError(f'not found: {property!r}')
     value = str(meta_soup['content'])
     if verbose: print_err(f'{name}: {value!r}')
     return value
Example #3
def _parse_非对应简繁汉字(非对应简繁汉字):
    # one traditional to many simplified: 12-1 entries
    # !!@:餘/余(馀)
    # !!@:摺(折)/折(摺)
    # ?? !!@:鑪/炉
    # one simplified to many traditional: 280 = 43+64+46+43+84 entries
    # !!:庵/庵菴(厂广)
    # !!:仿佛/仿佛、彷彿、髣髴
    # !!:干(乾)/干乾幹榦
    # !!:夹(挟)/夾(夹)挾
    # !!:菱/菱X
    非对应简繁汉字 = 非对应简繁汉字.replace('X', '※')
    非对应简繁汉字 = 非对应简繁汉字.replace('!!:仿佛/仿佛、彷彿、髣髴', r'''
!!:仿/仿彷髣
!!:佛/佛彿髴
        #''')  #280 +1

    txt = 非对应简繁汉字
    s = set()  # {(简, 繁)} i.e. (simplified, traditional) pairs

    head = "!!@:"
    pattern = r"^(?P<一繁>[\w()]+)/(?P<多简>[\w()※]{2,})$"

    it = _iter_line_tails(head, txt)
    rex = re.compile(pattern)
    pairs = []
    for tail in it:
        m = rex.fullmatch(tail)
        assert m is not None
        p = m["一繁"], m["多简"]
        pairs.append(p)
    assert len(pairs) == 12 - 1

    for 一繁, 多简 in pairs:
        for 繁 in _remove_p(一繁):
            for 简 in _remove_p(多简):
                s.add((简, 繁))

    #=========
    head = "!!:"
    pattern = r"^(?P<一简>[\w()]+)/(?P<多繁>[\w()※]{2,})$"

    it = _iter_line_tails(head, txt)
    rex = re.compile(pattern)
    pairs = []
    for tail in it:
        m = rex.fullmatch(tail)
        assert m is not None
        p = m["一简"], m["多繁"]
        pairs.append(p)
    assert len(pairs) == 280 + 1

    for 一简, 多繁 in pairs:
        for 简 in _remove_p(一简):
            for 繁 in _remove_p(多繁):
                if 繁 == "※":
                    print_err(f"{简}->{繁}")
                s.add((简, 繁))

    return s
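
Note: _iter_line_tails and _remove_p are used by both parsers here but defined elsewhere. Hedged sketches of their assumed behavior:

def _iter_line_tails(head, txt):
    # Assumed: yield the remainder of each line that starts with `head`.
    # The heads used here ("!!@:", "!!:", "!!!:", ...) do not prefix each
    # other, so a plain startswith test suffices.
    for line in txt.splitlines():
        line = line.strip()
        if line.startswith(head):
            yield line[len(head):]

def _remove_p(chars):
    # Assumed: drop the paren marks but keep the characters inside them,
    # e.g. '干(乾)' -> '干乾', so every listed form is iterated.
    return chars.replace('(', '').replace(')', '')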
Example #4
def check_echo(type_, obj):
    try:
        if type(obj) is not type_: raise TypeError
    except TypeError:
        from seed.tiny import print_err
        print_err(type_, repr(obj))
    return obj
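
A possible reading of check_echo: a pass-through type check that reports on stderr instead of failing. Hypothetical usage (not from the source):

n = check_echo(int, 42)        # type matches: returns 42 silently
s = check_echo(str, b'data')   # mismatch: reported on stderr, b'data' still returned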
    def bare_extract_ctext_org__url(self, url, *, verbose: bool, timeout,
                                    referrer, subcontents: bool, **kwargs):
        # url -> (title, txt)
        verbose = bool(verbose)
        try:
            if verbose: print_err(f'fetch webpage: {url!r}')
            page_bytes = self.fetch_webpage(url,
                                            timeout=timeout,
                                            referrer=referrer,
                                            **kwargs)

            if verbose: print_err(f'extracting webpage...: {url!r}')
            if not subcontents:
                title, txt = self.extract_ctext_org__text(page_bytes,
                                                          verbose=verbose)
                result = title, txt
            else:
                ((title, url),
                 subtitle_url_pairs) = self.extract_ctext_org__subcontents(
                     page_bytes, verbose=verbose)
                subtitle_url_pairs = tuple(subtitle_url_pairs)
                result = (title, url), subtitle_url_pairs
            if verbose: print_err(f'extract webpage done: {url!r}')
            return result
        except (CTextOrgConfirmError, *TimeoutErrors):
            if verbose: print_err(f'extract webpage confirm/timeout: {url!r}')
            raise
        except Exception as e:
            if verbose: print_err(f'extract webpage error: {url!r}')
            raise Exception(f'url={url!r}', e)
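
Note: TimeoutErrors is unpacked into several except clauses but never defined in these snippets. A plausible definition is a tuple of timeout exception types, e.g.:

import socket

# Hypothetical sketch (not from the source); requests.exceptions.Timeout
# could be added as well, since the session code here uses requests.
TimeoutErrors = (TimeoutError, socket.timeout)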
Example #6
def url2content(url, **kwargs):
    (content_type, content_data, url_info
    ) = download_file_from_url_ex(url, **kwargs)
    may_encoding = url_info.get_charset()
    #if may_encoding is None: raise Exception
    if may_encoding is None:
        encoding = 'utf8'
    else:
        encoding = may_encoding
    if type(encoding) is not str: raise Exception
    content = content_data.decode(encoding=encoding)
    return content
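    # NOTE: everything below this return is unreachable -- earlier
    # experimental variants kept for reference.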
    with open_url(url, **kwargs) as response:
        url_info = response.info()
        content = url_info.get_payload(decode=True)
            #get_payload(): This is a legacy method. On the EmailMessage class its functionality is replaced by get_content() and iter_parts().
        if url_info.is_multipart():
            assert type(content) is list
            content = ''.join(content)
        try:
            assert type(content) is str
        except:
            print_err(type(content))#bytes???
            print_err((content))
            raise
        return content
        print(dir(url_info))
            ##url_info :: HTTPMessage <: email.message.Message
            #file:///storage/emulated/0/0my_files/unzip/py_doc/python-3.8.1-docs-html/library/email.compat32-message.html#email.message.Message
            #[full dir(url_info) listing omitted: the standard email.message.Message attributes]
        content = url_info.get_content()
            #AttributeError: 'HTTPMessage' object has no attribute 'get_content'
        return content
def show_session(session):
    return
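    # disabled: the early return above skips the attribute dump below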
    attrs = type(session).__attrs__
    print_err('show_session')
    for attr in attrs:
        value = getattr(session, attr, None)
        print_err(f'\t{attr}={value!r}')
    def extract_ctext_org__text(self, html_file, *, verbose: bool):
        soup = BeautifulSoup(html_file, 'lxml')
        verbose = bool(verbose)

        title = self.extract_ctext_org__step1_title(soup, verbose=verbose)
        txt = self.extract_ctext_org__step2_ctext(soup, verbose=verbose)
        if verbose: print_err('extract_ctext_org__text done!')
        return title, txt
 def handle_exc(self, e):
     # requests.exceptions.ConnectionError
     print_err(repr(type(e)))
     print_err(repr(e))
     if input('(input nothing to quit) >>>'):
         traceback.print_exc()
         return None
     else:
         raise
    def handle_captcha_confirm(self, *, from_url, action_url, text_field_name,
                               verbose, timeout, **kwargs):
        verbose = bool(verbose)

        if verbose:
            print_err(
                f'text_field_name={text_field_name!r}; action_url={action_url!r}; from_url={from_url!r}'
            )

        while True:
            try:
                maybe_captcha_ex = self.ask_maybe_captcha_ex(
                    title='ctext.org requires confirm',
                    referrer=from_url,
                    timeout=max(10, timeout),
                    **kwargs)
                if not maybe_captcha_ex:
                    raise KeyboardInterrupt
                [image_bytes, captcha] = maybe_captcha_ex

                if not captcha: raise AssertionError('logic error: empty captcha')
                if verbose: print_err(f'input captcha = {captcha!r}')
                data = self.post_captcha(action_url=action_url,
                                         text_field_name=text_field_name,
                                         referrer=from_url,
                                         captcha=captcha,
                                         timeout=timeout,
                                         **kwargs)

                try:
                    self.extract_ctext_org__text(data, verbose=False)
                except CTextOrgConfirmError:
                    # wrong captcha entered: record it, then retry
                    self.save_captcha(image_bytes=image_bytes,
                                      captcha=captcha,
                                      correct=False)
                    continue
                except Exception as e:
                    # unknown error
                    self.handle_exc(e)
                else:
                    # input correct captcha
                    self.save_captcha(image_bytes=image_bytes,
                                      captcha=captcha,
                                      correct=True)

            except KeyboardInterrupt:
                raise
            except Exception as e:
                self.handle_exc(e)
                continue
            else:
                break
        #end while
        return
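
Note: CTextOrgConfirmError is not shown in these snippets; from its call sites it carries the confirm form's target URL and text-field name. A minimal sketch under that assumption:

class CTextOrgConfirmError(Exception):
    # Hypothetical sketch (not from the source): raised when ctext.org
    # answers with a captcha-confirm form instead of the page content.
    def __init__(self, *, action_url, text_field_name):
        super().__init__(action_url, text_field_name)
        self.action_url = action_url
        self.text_field_name = text_field_name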
Example #11
 def stable_repr_data(self, *, iter_eol_indents,
                      has_head_eol_when_indent: bool):
     mk_it = lambda: self.stable_iter_repr_data(
         iter_eol_indents=iter_eol_indents,
         has_head_eol_when_indent=has_head_eol_when_indent)
     try:
         return ''.join(mk_it())
     except Exception:
         from seed.tiny import print_err
         print_err(list(mk_it()))
         raise
Example #12
 def iter_ifiles():
     for glob_pattern in glob_patterns:
         for path in glob.iglob(glob_pattern, recursive=recursive):
             print_err(f'glob_pattern: {glob_pattern!r} ==>> path:{path!r}')
             fname = os.path.basename(path)
             if fname.lower() in skipped_buggy_fname_set:
                 print_err(f'\tskip:{fname!r}')
                 continue
             else:
                 with open(path, 'rt', encoding=encoding) as ifile:
                     yield ifile
    def unordered_iter_extract_ctext_org__referrer_url_pairs(
            self, referrer_url_pairs, *, verbose: bool, timeout, time_sep,
            **kwargs):
        # referrer_url_pairs :: [(referrer, url)]
        # -> Iter (url, referrer, title, txt)
        verbose = bool(verbose)
        referrer_url_pairs = list(referrer_url_pairs)

        while referrer_url_pairs:
            i = random.randrange(len(referrer_url_pairs))
            referrer, url = referrer_url_pairs[i]
            if verbose:
                print_err(
                    f'to fetch&extract webpage {url!r} from: {referrer!r}')

            if url not in self.cache:
                t = random.randrange(time_sep, 2 * time_sep)
                if verbose:
                    print_err(f'sleep {t}s before fetch&extract webpage')
                time.sleep(t)

            try:
                title, txt = self.cached_extract_ctext_org__url(
                    url,
                    referrer=referrer,
                    verbose=verbose,
                    timeout=timeout,
                    subcontents=False,
                    **kwargs)
            except CTextOrgConfirmError as e:
                #input('ctext.org requires confirm')
                self.handle_captcha_confirm(from_url=url,
                                            action_url=e.action_url,
                                            text_field_name=e.text_field_name,
                                            verbose=verbose,
                                            timeout=timeout,
                                            **kwargs)
                continue
            except KeyboardInterrupt:
                raise
            except (Exception, OSError, *TimeoutErrors) as e:
                self.handle_exc(e)
                continue
            yield url, referrer, title, txt

            L = len(referrer_url_pairs)
            swap_to_last_and_pop(referrer_url_pairs, i)
            assert len(referrer_url_pairs) == L - 1
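
Note: swap_to_last_and_pop is not shown; the surrounding length assertions suggest the usual O(1) unordered removal:

def swap_to_last_and_pop(xs, i):
    # Hypothetical sketch (not from the source): overwrite slot i with the
    # last element, then pop -- O(1), order not preserved.
    xs[i] = xs[-1]
    xs.pop()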
 def unordered_iter_extract_ctext_org__url_rng(self, base_url, indices,
                                               index_format, *,
                                               verbose: bool, timeout,
                                               time_sep, **kwargs):
     # base_url -> indices -> Iter (url, referrer, title, txt)
     #base_url = Path(base_url)
     verbose = bool(verbose)
     if verbose: print_err(f'fetch&extract webpages from: {base_url!r}')
     referrer_url_pairs = self.make_referrer_url_pairs__url_rng(
         base_url, indices, index_format)
     return self.unordered_iter_extract_ctext_org__referrer_url_pairs(
         referrer_url_pairs,
         verbose=verbose,
         timeout=timeout,
         time_sep=time_sep,
         **kwargs)
Example #15
def _parse_非对称繁简字(非对称繁简字):
    # '¥' -- replaced with the common traditional character
    # 95
    # !!!:11、当(!當噹)
    # !!!:16、恶(~噁)
    # !!!:21、干(~幹乾)
    # 30
    # !!!@:(110)、罗(!羅囉)
    # 39
    # !!!&:1、呆(¥獃)
    # !!!&:15、泪(~淚)#
    # 28
    # !!!$:16、凄(¥淒悽)
    # !!!$:26、伫(~佇竚)#?
    # 21?25?24
    # !!!$:2、雕(!鵰凋彫)
    txt = 非对称繁简字
    pattern = r"^[^\s、]+、(?P<一简>\w)\((?P<多繁>[!~]\w+)\)(?P<problem>(?:[#][?])?)$"
    head_count_pairs = [("!!!:", 95), ("!!!@:", 30), ("!!!&:", 39),
                        ("!!!$:", 28 + 24)]

    rex = re.compile(pattern)
    all_pairs = []
    for head, count in head_count_pairs:
        it = _iter_line_tails(head, txt)
        pairs = []
        for tail in it:
            m = rex.fullmatch(tail)
            assert m is not None
            if m["problem"]:
                print_err(tail)
            p = m["一简"], m["多繁"]
            pairs.append(p)
        assert len(pairs) == count
        all_pairs += pairs

    pairs = all_pairs
    s = set()  # {(简, 繁)} i.e. (simplified, traditional) pairs
    for 简, 多繁 in pairs:
        z = 简 if 多繁[0] == '~' else ''
        多繁 = z + 多繁[1:]
        assert len(多繁) >= 2
        for 繁 in 多繁:
            s.add((简, 繁))
    return s
    def extract_ctext_org__step1_title(self, soup, *, verbose: bool):
        # html_file -> title
        # html_file may be BytesIO/StringIO?
        verbose = bool(verbose)

        title_soup = soup.find('meta', property="og:title")
        if title_soup is None:
            [form_soup] = soup.find_all('form', method='post')
            [text_field_soup] = form_soup.find_all('input', type='text')
            text_field_name = str(text_field_soup['name'])
            action_url = form_soup['action']
            raise CTextOrgConfirmError(action_url=action_url,
                                       text_field_name=text_field_name)

        title = title_soup['content']
        if verbose: print_err(f'title: {title!r}')
        return title
def is_relax_biconnected_ugraph_fake_embedding_relax_planar_ex(*
    ,relax_biconnected_ugraph_fake_embedding
    ,hedge2fake_counterclockwise_fface
    ):
    '''is relax_biconnected-ugraph_fake_embedding relax_planar?

input:
    relax_biconnected_ugraph_fake_embedding
        # allow multiedge
        no self_loops
        vertex degree >= 2
        for each connected component cc:
            * cc is an isolated vertex
            OR:
            * cc is biconnected
                cc.num_vertices >= 2
                cc.num_aedges >= 2
output:
    (0, [[fface]]) | (1, (fface, fvertex)) | (2, (simple_cycle_hedges1, simple_path_hedges1))
        (0, [[fface]])
            relax_planar
            ffaces per connected component except isolated vertex
                ffaces is nonempty
                when merging ffaces in order, the intermediate graphs remain biconnected
                    i.e. avoid merging an fface that touches frontier_hedges exactly once
                touch(frontier_hedges, fface) = [(start_vertex, max_common_path_hedges0)]

        (1, (fface, fvertex))
            non_relax_planar
            fface which visits a fvertex twice

        (2, (simple_cycle_hedges1, simple_path_hedges1))
            non_relax_planar
            simple_cycle_hedges1 and simple_path_hedges1 are simple&nonempty
                simple_path_hedges1 may be a cycle
            when treat them as clockwise cycle/path:
                simple_path_hedges1 begin inside&on simple_cycle_hedges1
                simple_path_hedges1 end outside&on simple_cycle_hedges1

'''
    assert isinstance(relax_biconnected_ugraph_fake_embedding, UGraphFakeEmbedding)
    return _calc(relax_biconnected_ugraph_fake_embedding
                ,hedge2fake_counterclockwise_fface).calc_main()
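    # NOTE: the two print_err calls below are unreachable debug leftovers.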
    print_err(f'relax_biconnected_ugraph_fake_embedding={relax_biconnected_ugraph_fake_embedding}')
    print_err(f'hedge2fake_counterclockwise_fface={hedge2fake_counterclockwise_fface}')
def extract_ctext_org__url(url, *, verbose: bool, timeout, **kwargs):
    # url -> (title, txt)
    verbose = bool(verbose)
    try:
        if verbose: print_err(f'fetch webpage: {url!r}')
        page_bytes = fetch_webpage(url, timeout=timeout, **kwargs)

        if verbose: print_err(f'extracting webpage...: {url!r}')
        title, txt = extract_ctext_org(page_bytes, verbose=verbose)

        if verbose: print_err(f'extract webpage done: {url!r}')
        return title, txt
    except (CTextOrgConfirmError, *TimeoutErrors):
        if verbose: print_err(f'extract webpage confirm/timeout: {url!r}')
        raise
    except Exception as e:
        if verbose: print_err(f'extract webpage error: {url!r}')
        raise Exception(f'url={url!r}', e)
Example #19
def __handle_classfiles(
        existing_classfile_paths_via_jarfile,
        existing_iqnames,
        #sorted_excluding_iqname_prefixes,
        #excluding_iqname_prefix_trie,
        #excluding_iqname_prefixes_regex,
        to_exclude,
        *,
        verbose: bool):
    oprint = make_print_on(verbose)

    oprint(existing_classfile_paths_via_jarfile)
    try:
        rough_class_infos = classfile_xpaths2rough_class_infos(
            [], existing_classfile_paths_via_jarfile)
    except:
        print_err(
            f'existing_classfile_paths_via_jarfile={existing_classfile_paths_via_jarfile!r}'
        )
        raise

    known_iqnames = set(existing_iqnames)
    required_iqnames = set()
    excluded_iqnames = set()
    for (source_javafile_name, depended_iqnames) in rough_class_infos:
        oprint('\t', source_javafile_name)
        oprint('\t', depended_iqnames)

        for depended_iqname in depended_iqnames:
            if depended_iqname[0] in '["':
                print_err(f'depended_iqname={depended_iqname!r}')
                raise Exception
            #if depended_iqname in existing_iqnames: continue
            #if depended_iqname in required_iqnames: continue
            #if depended_iqname in excluded_iqnames: continue
            if depended_iqname in known_iqnames: continue
            known_iqnames.add(depended_iqname)

            if to_exclude(depended_iqname):
                excluded_iqnames.add(depended_iqname)
            else:
                required_iqnames.add(depended_iqname)
    return required_iqnames, excluded_iqnames
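
Note: make_print_on is not defined in these snippets; its use with `verbose` suggests it returns either a real printer or a no-op:

def make_print_on(enabled):
    # Hypothetical sketch (not from the source): print_err when enabled,
    # otherwise a do-nothing function with the same signature.
    if enabled:
        return print_err
    return lambda *args, **kwargs: None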
Example #20
    def make_all_rooted_utree_attrs(self, *, maybe_either_root):
        # no "utree"??
        # maybe_either_root - see: aedge2maybe_upper_hedge()
        aedge2maybe_upper_hedge = self.aedge2maybe_upper_hedge(
            maybe_either_root=maybe_either_root)
        vertex2maybe_parent_aedge = self.vertex2maybe_parent_aedge(
            aedge2maybe_upper_hedge=aedge2maybe_upper_hedge)
        either_root = self.either_root(
            aedge2maybe_upper_hedge=aedge2maybe_upper_hedge,
            vertex2maybe_parent_aedge=vertex2maybe_parent_aedge)
        vertex2child_aedges = self.vertex2child_aedges(
            aedge2maybe_upper_hedge=aedge2maybe_upper_hedge,
            vertex2maybe_parent_aedge=vertex2maybe_parent_aedge)
        vertex2maybe_parent_vertex = self.vertex2maybe_parent_vertex(
            aedge2maybe_upper_hedge=aedge2maybe_upper_hedge,
            vertex2maybe_parent_aedge=vertex2maybe_parent_aedge)
        vertex2depth = self.vertex2depth(
            vertex2maybe_parent_vertex=vertex2maybe_parent_vertex)
        depth2vertices1 = self.depth2vertices1(vertex2depth=vertex2depth)
        depth2depth_idx2vertex = self.depth2depth_idx2vertex(
            depth2vertices1=depth2vertices1)
        vertex2depth_idx = self.vertex2depth_idx(
            depth2vertices1=depth2vertices1)

        ################
        del self, maybe_either_root
        d = dict(locals())  # __class__ appears in locals() here via the implicit closure cell
        for name in (frozenset(d) - __class__.all_attr_set):
            del d[name]

        try:
            assert frozenset(d) == __class__.all_attr_set
        except:
            from seed.tiny import print_err
            print_err(__class__.all_attr_set - frozenset(d))
            print_err(frozenset(d) - __class__.all_attr_set)
            raise

        ns = ImmutableNamespace(**d)
        return ns
Example #21
    def __handle_complete_dot_idx(self, state):
        assert self.is_complete_state(state)
        #if state.dot_idx == 0:
        # direct nullable production_idx
        #   return
        nonterminal_idx = self.production_idx2nonterminal_idx[
            state.production_idx]
        begin = state.terminal_position_begin_of_production
        self.get_nonterminal_idx_begin_pair2complete_states().setdefault(
            (nonterminal_idx, begin), []).append(state)
        ### update first_prev_position should be before other put_state

        if begin == self.current_terminal_position:
            # direct/indirect null
            return
        prev_position = begin
        assert prev_position < self.current_terminal_position
        try:
            prev_states = self.terminal_position2nonterminal_idx2states[
                prev_position][nonterminal_idx]
        except KeyError:
            if (prev_position == 0
                    and nonterminal_idx in self.start_nonterminal_idc):
                # a start_nonterminal_idx has no parent
                return
            print_err(f'prev_position = {prev_position}')
            print_err(f'nonterminal_idx = {nonterminal_idx}')
            print_err(
                f'terminal_position2nonterminal_idx2states[{prev_position}]={self.terminal_position2nonterminal_idx2states[prev_position]}'
            )
            raise
        for prev_state in prev_states:
            self.put_forward_state(prev_state, prev_position)
        return
Example #22
def maybe_parse_CHISE_IDS__line(line):
    # -> None|(char, may_char_ref, tree)
    # tree = (op, [arg]) | ('hz', char:char) | ('ref', ref_entity:str)
    # may_char_ref:None|str
    line = line.strip()
    if not line or line.startswith(';'):
        return None
    line = Globals.bugs.get(line, line)

    m = Globals.line_rex.fullmatch(line)

    if not m:
        raise Exception(f'unknown format: {line!r}')
    unicode = m['unicode']
    assert unicode[0] == 'U'
    assert unicode[1] in '+-'
    order = int(unicode[2:], base=16)
    char_repr = m['char_repr']
    payload = m['payload']
    problem = m['problem']

    if len(char_repr) == 1:
        char = char_repr
        may_char_ref = None
        if ord(char) != order:
            raise Exception(f'bad format: unicode not match hz-char: {line!r}')
    else:
        char = chr(order)
        char_ref = char_repr
        may_char_ref = char_ref

    try:
        tree = parse_CHISE_IDS__payload(payload)
    except Exception as e:
        raise Exception(e, line)
    if problem: print_err(f'{line!r}')
    return (char, may_char_ref, tree)
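
Hypothetical usage (not from the source). The exact line layout depends on Globals.line_rex; assumed here: 'U+XXXX<TAB>char_repr<TAB>payload':

#   maybe_parse_CHISE_IDS__line('U+4F53\t体\t⿰亻本')
#   -> ('体', None, <tree from parse_CHISE_IDS__payload('⿰亻本')>)
# Blank lines and ';' comment lines yield None regardless of line_rex:
assert maybe_parse_CHISE_IDS__line('; -*- coding: utf-8 -*-') is None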
 def post_captcha(self, *, referrer, action_url, text_field_name, captcha,
                  timeout, **kwargs):
     data = {text_field_name: captcha}
     headers = self.make_headers(referrer=referrer)
     print_err(f'data={data}; headers={headers}')
     r = self.session.post(action_url,
                           headers=headers,
                           data=data,
                           allow_redirects=True,
                           timeout=timeout)
     show_session(self.session)
     data = r.content  # consume the response body
     return data
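     # NOTE: everything below is unreachable -- kept failed experiments.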
     return
     print_err(data)
     if input('(input yes to continue)>>>') != 'yes':
         raise KeyboardInterrupt
     return
     ######### fail
     self.fetch_webpage(action_url,
                        referrer=referrer,
                        data=data,
                        timeout=timeout,
                        **kwargs)
Example #24
    def parse_tokens(self, __tokens):
        calc = self.cfg.calc
        parser = BasicEarleyParser(
            production_idx2nonterminal_idx=calc.production_idx2nonterminal_idx,
            production_idx2idxalternative=calc.production_idx2idxalternative,
            nonterminal_idx2sorted_production_idc=(
                calc.nonterminal_idx2sorted_production_idc),
            nonterminal_idx2is_nullable=calc.nonterminal_idx2is_nullable,
            start_nonterminal_idc=self.start_nonterminal_idc,
            token2terminal_name=self.token2terminal_name,
            terminal_set_ops=calc.terminal_set_ops,
            terminal_set_idx2terminal_set=calc.terminal_set_idx2terminal_set,
            nonterminal_idx2nonterminal_name=(
                calc.nonterminal_idx2nonterminal_name),
            nonterminal_idx2maybe_one_null_tree=(
                calc.nonterminal_idx2maybe_one_null_tree))

        try:
            for _ in map(parser.feed, __tokens):
                pass
        except:
            for attr, obj in vars(parser).items():
                print_err(attr)
                print_err(' ' * 4, '=', obj)
            #for attr in dir(parser): print_err(attr, getattr(parser, attr))
            raise
        if _show_rough_size_of_BasicEarleyParser:
            print_err('_show_rough_size_of_BasicEarleyParser:on')
            print_err(parser._get_rough_size())

        cfg = self.cfg
        node = parser.extract_parse_main_tree(
            ambiguous_nonterminal_idc=self.ambiguous_nonterminal_idc,
            make_leaf_of_at=functools.partial(make_leaf_of_at, cfg,
                                              self.token2terminal_name),
            make_nonnull_nonleaf_of_between=functools.partial(
                make_nonnull_nonleaf_of_between, cfg),
            make_null_nonleaf_of_at=functools.partial(make_null_nonleaf_of_at,
                                                      cfg))
        return node
    def fecth_captcha_image__bytes(self, *, referrer, timeout, **kwargs):
        i = random.randrange(Global.captcha_url_random_range)
        #print_err(f"random i for captcha_url_fmt: {i}={i:x}")
        image_url = Global.captcha_url_fmt.format(i)
        content_data = self.download_file_from_url(image_url,
                                                   referrer=referrer,
                                                   timeout=timeout,
                                                   **kwargs)

        if False:
            print_err(f'captcha image content_type:{content_type!r}')
            print_err(type(content_data))
            print_err(content_data)
            #assert content_type.lower().startswith('image')
        return content_data
def fecth_image__PIL(url, **kwargs):
    (content_type, content_data,
     url_info) = download_file_from_url_ex(url, **kwargs)

    if False:
        print_err(f'captcha image content_type:{content_type!r}')
        print_err(type(content_data))
        print_err(content_data)

    assert content_type.lower().startswith('image')
    image_file = io.BytesIO(content_data)
    image_PIL = PIL.Image.open(image_file)
    #image_tk = PIL.ImageTk.PhotoImage(master=___donot_destroy_me, image=image_PIL)
    # RuntimeError('Too early to create image',)
    # see: ___donot_destroy_me
    return image_PIL
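
Note: download_file_from_url_ex is used here and in url2content above and returns (content_type, content_data, url_info). A minimal urllib-based sketch under that assumption (extra kwargs such as referrer are ignored in this sketch):

import urllib.request

def download_file_from_url_ex(url, *, timeout=None, **kwargs):
    # Hypothetical sketch (not from the source).
    with urllib.request.urlopen(url, timeout=timeout) as response:
        url_info = response.info()  # http.client.HTTPMessage
        content_type = url_info.get_content_type()
        content_data = response.read()  # bytes
    return content_type, content_data, url_info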
    def cached_extract_ctext_org__url(self, url, *, verbose: bool, timeout,
                                      referrer, subcontents: bool, **kwargs):
        # url -> (title, txt)
        verbose = bool(verbose)
        subcontents = bool(subcontents)

        fetch = lambda: self.bare_extract_ctext_org__url(
            url,
            referrer=referrer,
            verbose=verbose,
            timeout=timeout,
            subcontents=subcontents,
            **kwargs)
        if not subcontents:
            key = url
        else:
            key = '[subcontents]' + url

        def result2title(result):
            if not subcontents:
                title, txt = result
            else:
                ((title, _url), subtitle_url_pairs) = result
            return title

        if verbose:
            str_may_subcontents = '[subcontents]' if subcontents else ''
        if key not in self.cache:
            result = fetch()
            self.cache[key] = result
            title = result2title(result)

            if verbose:
                print_err(f'store title{str_may_subcontents!s}: {title!r}')

        if verbose: print_err(f'read cached webpage: {url!r}')
        result = self.cache[key]
        title = result2title(result)
        if verbose: print_err(f'read title{str_may_subcontents!s}: {title!r}')
        return result
Example #28
    def _init_subclass4StructBase_(cls):
        if inspect.isabstract(cls): return
        impl_attr_seq = tuple(cls.__iter_all_impl_attrs__())
        cls.__all_impl_attr_set__ = frozenset(impl_attr_seq)

        cached_attr_calc_pair_seq = tuple(
            cls.__iter_all_cached_attr_calc_pairs__())
        cls.__cached_attr2calc__ = MappingProxyType(
            dict(cached_attr_calc_pair_seq))
        try:
            if len(cls.__all_impl_attr_set__) != len(impl_attr_seq):
                raise TypeError
        except:
            print_err(cls.__all_impl_attr_set__)
            print_err(impl_attr_seq)
            from seed.iters.duplicate_elements import find_duplicate_element_groups
            print_err(find_duplicate_element_groups(impl_attr_seq))
            raise
        if len(cls.__cached_attr2calc__) != len(cached_attr_calc_pair_seq):
            raise TypeError

        cls.__all_primekey_attr_seq__ = tuple(
            cls.__iter_all_primekey_attrs__())
        cls.__all_user_attr_seq__ = tuple(cls.__iter_all_user_attrs__())
def iter_extract_ctext_org__url_rng(base_url, indices, index_format, *,
                                    verbose: bool, timeout, time_sep,
                                    **kwargs):
    # base_url -> indices -> Iter (title, txt)
    #base_url = Path(base_url)
    verbose = bool(verbose)
    if verbose: print_err(f'fetch&extract webpages from: {base_url!r}')

    if base_url[-1:] == '/':
        base_url = base_url[:-1]
    referrer = base_url
    base_fmt = f'{base_url}/{index_format}'

    for i in indices:
        #str_i = str(i)
        #url = base_url / str_i; url = str(url)
        #url = os.path.join(base_url, str_i)
        #url = f'{base_url}/{i}'
        url = base_fmt.format(i)
        #print(url)
        while True:
            t = random.randrange(time_sep, 2 * time_sep)
            if verbose:
                print_err(
                    f'sleep {t}s before fetch&extract webpages from: {base_url!r}'
                )
            time.sleep(t)
            try:
                title, txt = extract_ctext_org__url(url,
                                                    verbose=verbose,
                                                    timeout=timeout,
                                                    referrer=referrer,
                                                    **kwargs)
            except CTextOrgConfirmError as e:
                #input('ctext.org requires confirm')
                action_url = e.action_url
                text_field_name = e.text_field_name
                while True:
                    try:
                        captcha = ask_captcha(
                            title='ctext.org requires confirm',
                            referrer=url,
                            timeout=max(10, timeout))
                        post_captcha(action_url=action_url,
                                     text_field_name=text_field_name,
                                     referrer=url,
                                     captcha=captcha,
                                     timeout=timeout)
                    except KeyboardInterrupt:
                        raise
                    except Exception as e2:
                        print_err(repr(e2))
                        if input('>>>'):
                            traceback.print_exc()
                            continue
                        else:
                            raise e2
                    else:
                        break
                continue
            except KeyboardInterrupt:
                raise
            except (Exception, OSError, *TimeoutErrors) as e:
                print_err(repr(e))
                continue
            break
        yield title, txt
Example #30
def main(args=None):
    import argparse
    from seed.io.may_open import may_open_stdin, may_open_stdout

    parser = argparse.ArgumentParser(
        description='count identifiers',
        epilog='',
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument('-i',
                        '--input',
                        type=str,
                        default=None,
                        help='input file path')
    parser.add_argument('-g',
                        '--glob_pattern',
                        type=str,
                        default=None,
                        help='treat <input> as folder path')
    parser.add_argument('-o',
                        '--output',
                        type=str,
                        default=None,
                        help='output file path')
    parser.add_argument('-e',
                        '--encoding',
                        type=str,
                        default='utf8',
                        help='input/output file encoding')
    parser.add_argument('-f',
                        '--force',
                        action='store_true',
                        default=False,
                        help='overwrite the output file if it exists')

    args = parser.parse_args(args)
    encoding = args.encoding
    omode = 'wt' if args.force else 'xt'

    d = {}

    def f(fin):
        for line in fin:
            feed(d, line)

    may_glob_pattern = args.glob_pattern
    if may_glob_pattern is None:
        may_ifname = args.input
        with may_open_stdin(may_ifname, 'rt', encoding=encoding) as fin:
            f(fin)
    else:
        glob_pattern = may_glob_pattern
        may_root = args.input
        root = '.' if not may_root else may_root
        for path in iter_files(root, glob_pattern):
            try:
                with open(path, 'rt', encoding=encoding) as fin:
                    f(fin)
            except UnicodeDecodeError:
                print_err(path)
                continue
            except:
                print_err(path)
                raise

    ls = lst(d)
    may_ofname = args.output
    with may_open_stdout(may_ofname, omode, encoding=encoding) as fout:
        show(fout, ls)
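
Note: feed, lst, and show come from elsewhere in this module; given the parser description ('count identifiers'), plausible sketches are:

import re

_identifier_rex = re.compile(r'[A-Za-z_]\w*')

def feed(d, line):
    # Hypothetical sketch (not from the source): count each identifier.
    for m in _identifier_rex.finditer(line):
        d[m.group()] = d.get(m.group(), 0) + 1

def lst(d):
    # Most frequent first, then alphabetical.
    return sorted(d.items(), key=lambda kv: (-kv[1], kv[0]))

def show(fout, ls):
    for word, count in ls:
        print(f'{count}\t{word}', file=fout)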