def main(argv=None):
    """CLI entry: merge adjacent html <p> elements of the input.

    Reads the whole input (file or stdin), runs merge_html_p() on it and
    writes the result to the output (file or stdout).
    Returns 0 on success.
    """
    import argparse, sys
    from seed.io.may_open import may_open_stdout, may_open_stdin

    class Globals:
        # default encodings for -oe / -ie
        output_file_encoding = 'utf8'
        input_file_encoding = 'utf8'

    ###################
    parser = argparse.ArgumentParser(
        description='merge html <p>',
        epilog='''
<p></p> is the true seperator
"<p>abc</p> <p>def</p>" ==>> "<p>abcdef</p>"
''')
    add_argument = parser.add_argument
    add_argument('-i', '--input_file', type=str, default=None,
                 help='the input file')
    add_argument('-ie', '--input_encoding', type=str,
                 default=Globals.input_file_encoding,
                 help='the encoding of input file')
    add_argument('-o', '--output_file', type=str, default=None,
                 help='the output file')
    add_argument('-oe', '--output_encoding', type=str,
                 default=Globals.output_file_encoding,
                 help='the encoding of output file')
    args = parser.parse_args(argv)

    with may_open_stdin(args.input_file, 'rt',
                        encoding=args.input_encoding) as fin:
        pseudo_htm = fin.read()
    txt = merge_html_p(pseudo_htm)
    if args.output_file is not None:
        # try output_encoding: fail early, before the ('xt') output file
        # is created, if txt cannot be encoded
        txt.encode(args.output_encoding)
    with may_open_stdout(args.output_file, 'xt',
                         encoding=args.output_encoding) as fout:
        fout.write(txt)
    #parser.exit(0)
    return 0
    # NOTE(fix): removed unreachable copy-paste residue that followed the
    # return above (a second with-block calling extract_fb_opf_items,
    # pasted from the fb.opf tool).
def main(args=None, /):
    """CLI entry: create a new python module file from template4module.

    The template lives in python3_src/useful.txt between the two marker
    lines; every "xxx.yyy" inside it is replaced by the dotted module
    name derived from the output path relative to Globals.this_pkg_root.
    """
    import argparse
    from seed.io.may_open import may_open_stdin, may_open_stdout

    parser = argparse.ArgumentParser(
        description='make new python module using template4module in python3_src/useful.txt',
        epilog=r'''
template from:
    view ../../python3_src/useful.txt
between:
    #[[[[[template4module:begin
    #]]]]]template4module:end
replace "xxx.yyy" to "pkg.module"
#''',
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument('-i', '--input', type=str,
                        default=Globals.path4useful_txt,
                        help='input file path for template4module')
    parser.add_argument('-o', '--output', type=str, default=None,
                        required=True,
                        help='output file path for target module')
    parser.add_argument('-e', '--encoding', type=str, default='utf8',
                        help='input/output file encoding')
    parser.add_argument('-f', '--force', action='store_true', default=False,
                        help='open mode for output file')
    args = parser.parse_args(args)

    encoding = args.encoding
    omode = 'wt' if args.force else 'xt'

    # read the whole useful.txt (or stdin)
    with may_open_stdin(args.input, 'rt', encoding=encoding) as fin:
        whole_txt = fin.read()

    begin_marker = '#[[[[[template4module:begin\n'
    end_marker = '#]]]]]template4module:end\n'
    substr4replace = r'xxx.yyy'
    # slice out the template body between the two marker lines
    lo = whole_txt.index(begin_marker) + len(begin_marker)
    hi = whole_txt.index(end_marker, lo)
    template4module = whole_txt[lo:hi]

    path4target_module = Path(args.output)
    if not '.py' == path4target_module.suffix:
        raise ValueError
    path4target_module = path4target_module.resolve()
    rpath = path4target_module.relative_to(Globals.this_pkg_root)
    posix_str = rpath.as_posix()
    assert posix_str.endswith('.py')
    posix_str = posix_str[:-3]
    if '.' in posix_str:
        raise ValueError(posix_str)
    module_qname = posix_str.replace('/', '.')

    # validate the dotted name: nonempty identifier parts only
    attrs = module_qname.split('.')
    if not attrs:
        raise ValueError
    if not all(attrs):
        raise ValueError
    if not all(attr.isidentifier() for attr in attrs):
        raise ValueError

    txt4output = template4module.replace(substr4replace, module_qname)
    with open(path4target_module, omode, encoding=encoding) as fout:
        fout.write(txt4output)
def main(argv=None):
    """CLI entry: extract the first content_div of a 360doc.com article.

    Input is either a url (-url) or a file/stdin (-i); a file that fails
    text decoding is retried in binary mode.  Returns 0 on success.
    """
    import argparse
    from seed.io.may_open import may_open_stdin, may_open_stdout

    parser = argparse.ArgumentParser(
        description='extract first content_div of article on 360doc.com')
    parser.add_argument('-e', '--encoding', type=str, default='utf8',
                        help='input/output file encoding')
    parser.add_argument('-i', '--input', type=str, default=None,
                        help='input file path')
    parser.add_argument('-o', '--output', type=str, default=None,
                        help='output file path')
    parser.add_argument('-f', '--force', action='store_true', default=False,
                        help='open mode for output file')
    parser.add_argument('-url', '--url', type=str, default=None,
                        help='input webpage url')
    args = parser.parse_args(argv)

    encoding = args.encoding
    omode = 'wt' if args.force else 'xt'
    if args.input is not None and args.url is not None:
        raise ValueError('input both file and url at same time')

    if args.url is not None:
        with open_webpage(args.url) as fin:
            content_div = extract_360doc_com(fin)
    else:
        may_ifname = args.input
        try:
            # open as text file
            with may_open_stdin(may_ifname, 'rt', encoding=encoding) as fin:
                content_div = extract_360doc_com(fin)
        except UnicodeError:
            # stdin cannot raise here before any read with a path of None;
            # only a real file triggers the binary retry
            assert may_ifname is not None
            ifname = may_ifname
            # open as binary file
            with open(ifname, 'rb') as fin:
                content_div = extract_360doc_com(fin)

    # NOTE(fix): removed a dead `if 0:` debug block that printed offsets
    # of the first non-ascii char of content_div and returned early.
    may_ofname = args.output
    with may_open_stdout(may_ofname, omode, encoding=encoding) as fout:
        fout.write(content_div)
    parser.exit(0)
    return 0
def fs(may_ifnames, fout):
    """For each input file, split every line into whitespace-separated
    tokens and feed them to f(); lines that f() rejects are copied to
    fout unchanged.

    NOTE(review): relies on names from the enclosing module scope —
    `may_open_stdin`, `encoding` and `f`; confirm they are defined there.
    """
    for may_ifname in may_ifnames:
        with may_open_stdin(may_ifname, 'rt', encoding=encoding) as fin:
            for line in fin:
                hexdigits_ls = line.split()
                try:
                    f(hexdigits_ls, fout)
                except Exception:
                    # not convertible: pass the raw line through
                    # (it keeps its own '\n', hence end='')
                    print(line, file=fout, end='')
                    continue
def main(argv=None):
    """CLI entry: simple symmetric encryption of ascii text via aCrypt.

    encrypt: ascii cleartext in, utf8 ciphertext out;
    decrypt: utf8 ciphertext in, ascii cleartext out.
    Returns 0 on success; raises TypeError on a bad password.
    """
    import argparse
    from seed.io.may_open import may_open_stdin, may_open_stdout

    encodingI = 'ascii'
    encodingO = 'utf8'
    parser = argparse.ArgumentParser(
        description='simple encrypt ascii text',
        epilog=r'only " " and "\n" are allowed, other control/whitespace should not occur in input text'
        #, formatter_class=argparse.RawDescriptionHelpFormatter
        )
    #parser.add_argument('-e', '--encoding', type=str, default='utf8', help='input/output file encoding')
    parser.add_argument('cmd', type=str, choices='encrypt decrypt'.split(),
                        help='encrypt/decrypt - treat input as cleartext/ciphertext')
    parser.add_argument('psw', type=str,
                        help='password: regex = [0-9a-f]*')
    parser.add_argument('-i', '--input', type=str, default=None,
                        help='input file path')
    parser.add_argument('-o', '--output', type=str, default=None,
                        help='output file path')
    parser.add_argument('-f', '--force', action='store_true', default=False,
                        help='open mode for output file')
    args = parser.parse_args(argv)

    psw = args.psw
    # password must consist only of chars aCrypt knows
    if not all(ch in aCrypt.char2idxP for ch in psw):
        raise TypeError
    omode = 'wt' if args.force else 'xt'

    does_encrypt = args.cmd == 'encrypt'
    # cleartext side is ascii, ciphertext side is utf8
    iencoding = encodingI if does_encrypt else encodingO
    oencoding = encodingO if does_encrypt else encodingI

    with may_open_stdin(args.input, 'rt', encoding=iencoding) as fin:
        input_text = ''.join(fin)
    if does_encrypt:
        output_text = aCrypt.encrypt(psw, input_text)
    else:
        output_text = aCrypt.decrypt(psw, input_text)
    with may_open_stdout(args.output, omode, encoding=oencoding) as fout:
        fout.write(output_text)
    parser.exit(0)
    return 0
def main(argv=None):
    """CLI entry: translate characters between lower and upper case.

    Delegates to lower_file(); with --upper it maps lower->upper,
    otherwise upper->lower.  --sep is forwarded for 2-column output.
    Returns 0 on success.
    """
    import argparse, sys
    # FIX(consistency): sibling commands import the may_open helpers
    # locally; this one used them without importing them anywhere visible.
    from seed.io.may_open import may_open_stdin, may_open_stdout

    parser = argparse.ArgumentParser(
        description='translate characters from lower to upper case')
    parser.add_argument('-i', '--input', type=str, default=None,
                        help='input file name')
    parser.add_argument('-o', '--output', type=str, default=None,
                        help='output file name')
    parser.add_argument('-e', '--encoding', type=str, default='utf8',
                        help='encoding of input/output file')
    parser.add_argument('-f', '--force', action='store_true', default=False,
                        help='open mode for output file')
    parser.add_argument('-s', '--sep', type=str, default=None,
                        help='seperator string of output file which has 2 columes')
    parser.add_argument('-u', '--upper', action='store_true', default=False,
                        help='lower2upper instead of upper2lower')
    args = parser.parse_args(argv)

    encoding = args.encoding
    omode = 'wt' if args.force else 'xt'
    may_ifname = args.input
    may_ofname = args.output
    # newline='' keeps original line endings untranslated on both sides
    with may_open_stdin(may_ifname, 'rt', encoding=encoding, newline='')\
            as fin\
            , may_open_stdout(may_ofname, omode, encoding=encoding, newline='')\
            as fout:
        lower_file(fout, fin, upper=args.upper, sep=args.sep)
    parser.exit(0)
    return 0
def main(args=None):
    """CLI entry: bitwise-invert every byte of a file.

    With --auto_name_output and no explicit -o, the output name is
    derived from the input name by toggling a ".inv" suffix.
    """
    import argparse
    from seed.io.may_open import may_open_stdin, may_open_stdout

    parser = argparse.ArgumentParser(
        description='invert bytes of file',
        epilog='',
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument('-i', '--input', type=str, default=None,
                        help='input file path')
    parser.add_argument('-o', '--output', type=str, default=None,
                        help='output file path')
    parser.add_argument('-f', '--force', action='store_true', default=False,
                        help='open mode for output file')
    parser.add_argument('-a', '--auto_name_output', action='store_true',
                        default=False,
                        help='add/remove ".inv" to input fname for output fname')
    args = parser.parse_args(args)

    omode = 'wb' if args.force else 'xb'
    may_ifname = args.input
    may_ofname = args.output
    if (args.auto_name_output
            and may_ofname is None
            and may_ifname is not None):
        ext = '.inv'
        if may_ifname.endswith(ext):
            #bug:may_ofname = may_ifname[-len(ext):]
            may_ofname = may_ifname[:-len(ext)]
        else:
            may_ofname = may_ifname + ext

    with may_open_stdin(may_ifname, 'rb', encoding=None) as fin:
        with may_open_stdout(may_ofname, omode, encoding=None) as fout:
            # stream in fixed-size chunks; TABLE maps each byte to its
            # inverse.  NOTE(review): BLOCK_SIZE/TABLE come from module
            # scope — confirm they are defined there.
            while 1:
                block = fin.read(BLOCK_SIZE)
                if not block:
                    break
                fout.write(block.translate(TABLE))
def on_subcmd__update_branch(sf, subcmd_name, parsed_args):
    """Handle the update_branch subcommand.

    Reads the extended dir-cmp result file named by parsed_args.input,
    checks its branch name matches the command line, then forwards all
    pieces to type(sf)._main4subcmds.on_update_lhs_branch().
    """
    parsed_args.input  # fail fast if the attribute is missing
    from seed.io.may_open import may_open_stdin, may_open_stdout
    may_ifname = parsed_args.input
    encoding = parsed_args.encoding
    with may_open_stdin(may_ifname, 'rt', encoding=encoding) as fin:
        #see:[location4fmt_of_file4result_of_dir_cmp__relative__extended]
        (lhs_branch_name,
         lhs_branch_idx4old,
         rhs_ignorefile_relative_path_encoding_pairs,
         result_of_dir_cmp__relative,
         ) = read__file4result_of_dir_cmp__relative__extended(fin)
    #options_update_branch
    if lhs_branch_name != parsed_args.lhs_branch_name:
        raise ValueError
    #if lhs_branch_idx4old != parsed_args.lhs_branch_idx: raise ValueError
    type(sf)._main4subcmds.on_update_lhs_branch(
        lhs_repository_extra_cache_root_dir_path=parsed_args.lhs_repository_extra_cache_root_dir_path,
        lhs_repository_root_dir_path=parsed_args.lhs_repository_root_dir_path,
        lhs_branch_name=lhs_branch_name,
        lhs_branch_idx4old=lhs_branch_idx4old,
        rhs_real_fsys_root_dir_path=parsed_args.rhs_real_fsys_root_dir_path,
        rhs_ignorefile_relative_path_encoding_pairs=rhs_ignorefile_relative_path_encoding_pairs,
        result_of_dir_cmp__relative=result_of_dir_cmp__relative,
        lcp_threshold=parsed_args.lcp_threshold)
    return
def main(argv=None):
    """CLI entry: print the sorted set of distinct characters of a text.

    With --repr_as_unicode each char is rendered as \\UXXXXXXXX escapes.
    """
    import argparse
    import sys
    # FIX(consistency): sibling commands import the may_open helpers
    # locally; this one used them without importing them anywhere visible.
    from seed.io.may_open import may_open_stdin, may_open_stdout

    parser = argparse.ArgumentParser(
        description='sort and unique chars in text.')
    parser.add_argument('-e', '--encoding', type=str, default='utf8',
                        help='input/output file encoding')
    parser.add_argument('-f', '--force', action='store_true', default=False,
                        help='open mode for output file')
    parser.add_argument('-i', '--input', type=str,
                        help='path to the input novel text file')
    parser.add_argument('-o', '--output', type=str,
                        help='path to the output file')
    parser.add_argument('-u', '--repr_as_unicode', action='store_true',
                        default=False,
                        help='output char in format \\UXXXXXXXX')
    args = parser.parse_args(argv)

    encoding = args.encoding
    omode = 'wt' if args.force else 'xt'
    may_ifname = args.input
    with may_open_stdin(may_ifname, 'rt', encoding=encoding) as fin:
        #all_chars = file2unique_sorted_char_string(fin)
        all_chars = file2all_char_set(fin)
    s = chars2sorted_char_string(all_chars)
    if args.repr_as_unicode:
        s = repr_string_as_unicode(s)
    may_ofname = args.output
    with may_open_stdout(may_ofname, omode, encoding=encoding) as fout:
        print(s, file=fout)
def main(argv=None):
    """CLI entry: parse pythoncoded_rules_in_str into python code.

    Reads the rules text, runs parse_ex() and writes head+tail code to
    the output.
    """
    import argparse
    from seed.io.may_open import may_open_stdin, may_open_stdout

    parser = argparse.ArgumentParser(
        description='parse pythoncoded_rules_in_str',
        epilog=example_doc,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument('-i', '--input', type=str, default=None,
                        help='input file path')
    parser.add_argument('-o', '--output', type=str, default=None,
                        help='output file path')
    parser.add_argument('-e', '--encoding', type=str, default='utf8',
                        help='input/output file encoding')
    parser.add_argument('-f', '--force', action='store_true', default=False,
                        help='open mode for output file')
    args = parser.parse_args(argv)

    encoding = args.encoding
    omode = 'wt' if args.force else 'xt'

    with may_open_stdin(args.input, 'rt', encoding=encoding) as fin:
        pythoncoded_rules_in_str = fin.read()
    python_code_str_ex = parse_ex(pythoncoded_rules_in_str,
                                  name2count=None,
                                  the_input_parameter_name='p',
                                  with_class_keyword=False)
    head_str, tail_str, name2count = python_code_str_ex
    with may_open_stdout(args.output, omode, encoding=encoding) as fout:
        fout.write(head_str)
        fout.write(tail_str)
def main(argv=None):
    """CLI entry: list xhtml item hrefs of an epub OPS/fb.opf manifest.

    Delegates the actual extraction to extract_fb_opf_items().
    Returns 0 on success.
    """
    import argparse, sys
    from seed.io.may_open import may_open_stdout, may_open_stdin

    class Globals:
        # default encodings for -oe / -ie
        output_file_encoding = 'utf8'
        input_file_encoding = 'utf8'

    ###################
    parser = argparse.ArgumentParser(
        description='extract epub/OPS/fb.opf',
        epilog='''
extract epub/OPS/fb.opf::manifest.item.href
    where item["media-type"]=="application/xhtml+xml"
''')
    add_argument = parser.add_argument
    add_argument('-i', '--input_file', type=str, default=None,
                 help='the input file')
    add_argument('-ie', '--input_encoding', type=str,
                 default=Globals.input_file_encoding,
                 help='the encoding of input file')
    add_argument('-o', '--output_file', type=str, default=None,
                 help='the output file')
    add_argument('-oe', '--output_encoding', type=str,
                 default=Globals.output_file_encoding,
                 help='the encoding of output file')
    args = parser.parse_args(argv)

    with may_open_stdout(args.output_file, 'xt',
                         encoding=args.output_encoding) as fout\
            , may_open_stdin(args.input_file, 'rt',
                             encoding=args.input_encoding) as fin:
        extract_fb_opf_items(fout, fin)
    #parser.exit(0)
    return 0
def main(args=None, /):
    """CLI entry: 简繁对称字 middle-parse.

    Runs the 3980-char parser over the input and prints a total-count
    header line followed by the result.
    """
    import argparse
    from seed.io.may_open import may_open_stdin, may_open_stdout

    parser = argparse.ArgumentParser(
        description='简繁对称字-middle-parse',
        epilog='',
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument('-i', '--input', type=str, default=None,
                        help='input file path')
    parser.add_argument('-o', '--output', type=str, default=None,
                        help='output file path')
    parser.add_argument('-e', '--encoding', type=str, default='utf8',
                        help='input/output file encoding')
    parser.add_argument('-f', '--force', action='store_true', default=False,
                        help='open mode for output file')
    args = parser.parse_args(args)

    encoding = args.encoding
    omode = 'wt' if args.force else 'xt'
    with may_open_stdin(args.input, 'rt', encoding=encoding) as fin:
        s = parser4pseudo_symmetric_hz_from_completed_chars_3980(fin)
    with may_open_stdout(args.output, omode, encoding=encoding) as fout:
        print(
            f'###parser4pseudo_symmetric_hz_from_completed_chars_3980:total={len(s)}###',
            file=fout)
        print(s, file=fout)
def main(argv=None):
    """CLI entry: merge wrapped lines back into paragraphs (novel text).

    Classification patterns come from GlobalArgsExample; the classified
    (case, line) pairs are merged by merge_case_line_pairs().
    """
    import argparse, sys
    # FIX(consistency): sibling commands import the may_open helpers
    # locally; this one used them without importing them anywhere visible.
    from seed.io.may_open import may_open_stdin, may_open_stdout

    parser = argparse.ArgumentParser(
        description='merge lines into paragraph for novel text')
    parser.add_argument('-i', '--input', type=str, default=None,
                        help='input file name')
    parser.add_argument('-o', '--output', type=str, default=None,
                        help='output file name')
    parser.add_argument('-e', '--encoding', type=str, default='utf8',
                        help='encoding of input/output file')
    parser.add_argument('-f', '--force', action='store_true', default=False,
                        help='open mode for output file')
    args = parser.parse_args(argv)

    encoding = args.encoding
    omode = 'wt' if args.force else 'xt'
    may_ifname = args.input
    with may_open_stdin(may_ifname, 'rt', encoding=encoding) as fin:
        case_line_pairs = novel_merge_lines_into_paragraph__pattern(
            GlobalArgsExample.not_merge_pattern,
            GlobalArgsExample.transparent_pattern,
            GlobalArgsExample.case_pattern_pairs,
            iter(fin))
        # materialize while fin is still open (the result is lazy)
        case_line_pairs = list(case_line_pairs)
    #print(case_line_pairs)
    may_ofname = args.output
    with may_open_stdout(may_ofname, omode, encoding=encoding) as fout:
        for line in merge_case_line_pairs(case_line_pairs):
            print(line, file=fout)
def main(argv=None):
    """CLI entry: html-escape text; --quote also escapes quote chars.

    Returns 0 on success.
    """
    import argparse
    from seed.io.may_open import may_open_stdin, may_open_stdout

    parser = argparse.ArgumentParser(
        description='simple encape text to html',
        epilog=r'''only "<>&" and "\"\'" if quote=True will be escaped.'''
        #, formatter_class=argparse.RawDescriptionHelpFormatter
        )
    parser.add_argument('-e', '--encoding', type=str, default='utf8',
                        help='input/output file encoding')
    parser.add_argument('-i', '--input', type=str, default=None,
                        help='input file path')
    parser.add_argument('-o', '--output', type=str, default=None,
                        help='output file path')
    parser.add_argument('-f', '--force', action='store_true', default=False,
                        help='open mode for output file')
    parser.add_argument('-q', '--quote', action='store_true', default=False,
                        help=r'''escape "\"\'" too; otherwise only "<>&" be escaped''')
    args = parser.parse_args(argv)

    encoding = args.encoding
    omode = 'wt' if args.force else 'xt'
    with may_open_stdin(args.input, 'rt', encoding=encoding) as fin:
        input_text = ''.join(fin)
    output_text = escape(input_text, args.quote)
    with may_open_stdout(args.output, omode, encoding=encoding) as fout:
        fout.write(output_text)
    parser.exit(0)
    return 0
def main(args=None):
    """CLI entry: pretty-print html/xml with a configurable indent width.

    Parses with BeautifulSoup(lxml), prettifies, then rescales the
    indentation via replace_indent_spaces().
    """
    import argparse
    from seed.io.may_open import may_open_stdin, may_open_stdout

    parser = argparse.ArgumentParser(
        description='format html/xml by well indent',
        epilog='',
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument('-i', '--input', type=str, default=None,
                        help='input file path')
    parser.add_argument('-o', '--output', type=str, default=None,
                        help='output file path')
    parser.add_argument('-e', '--encoding', type=str, default='utf8',
                        help='input/output file encoding')
    parser.add_argument('-f', '--force', action='store_true', default=False,
                        help='open mode for output file')
    parser.add_argument('-iw', '--indent_width', type=int, default=1,
                        help='the number of indent spaces for children')
    args = parser.parse_args(args)

    encoding = args.encoding
    omode = 'wt' if args.force else 'xt'
    with may_open_stdin(args.input, 'rt', encoding=encoding) as fin:
        soup = BeautifulSoup(fin, 'lxml')
    txt = replace_indent_spaces(args.indent_width, soup.prettify())
    with may_open_stdout(args.output, omode, encoding=encoding) as fout:
        fout.write(txt)
def main(args=None):
    """CLI entry: split a text file into many files, cutting at lines
    that match a head/tail separator regex; delegates to cut_text()."""
    import argparse
    from seed.io.may_open import may_open_stdin, may_open_stdout

    parser = argparse.ArgumentParser(
        description='cut text file into many text files by detect head/tail line for each output files',
        epilog=r"""
usiing python.re.match
#not search/fullmatch
for search: r".*?{pattern}"
for fullmatch: r"^{pattern}$"
#""",
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument('-i', '--input', type=str, default=None,
                        help='input file path')
    parser.add_argument('-re', '--sep_line_regex', type=str, required=True,
                        help='python regex pattern for input line')
    parser.add_argument('-cs', '--sep_line_case', choices='head tail'.split(),
                        required=True,
                        help='input line which match sep_line_regex is head/tail')
    parser.add_argument('-od', '--output_dir', type=str, required=True,
                        help='output dir path')
    parser.add_argument('-ofmt', '--output_file_name_fmt', type=str,
                        default='{0}.txt',
                        help="python str format of output file name; {0} for number_offset; eg, -ofmt '{0:0>4}.txt'")
    # FIX: was type=str with default=0 — a value given on the command line
    # stayed a string while the default was an int; the offset is numeric,
    # so parse it as int.
    parser.add_argument('-oi', '--output_file_name_number_offset', type=int,
                        default=0,
                        help='number_offset of output file name')
    parser.add_argument('-n', '--max_sep_lines_per_ofile', type=int,
                        required=True,
                        help='max number of sep_lines per output_file')
    parser.add_argument('-e', '--encoding', type=str, default='utf8',
                        help='input/output file encoding')
    parser.add_argument('-f', '--force', action='store_true', default=False,
                        help='open mode for output file')
    args = parser.parse_args(args)

    encoding = args.encoding
    # (unused local `omode` removed; cut_text receives force= directly)
    may_ifname = args.input
    with may_open_stdin(may_ifname, 'rt', encoding=encoding) as fin:
        cut_text(fin,
                 force=args.force,
                 oencoding=encoding,
                 odir=args.output_dir,
                 ofname_number_offset=args.output_file_name_number_offset,
                 ofname_fmt=args.output_file_name_fmt,
                 sep_line_regex=re.compile(args.sep_line_regex),
                 sep_line_case=args.sep_line_case,
                 max_sep_lines_per_ofile=args.max_sep_lines_per_ofile)
def main(args=None):
    """CLI entry: count identifiers.

    Counts identifiers in one file/stdin, or — with -g — in every file
    matching the glob under the <input> folder, then prints the tally.
    """
    import argparse
    from seed.io.may_open import may_open_stdin, may_open_stdout

    parser = argparse.ArgumentParser(
        description='count identifiers',
        epilog='',
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument('-i', '--input', type=str, default=None,
                        help='input file path')
    parser.add_argument('-g', '--glob_pattern', type=str, default=None,
                        help='treat <input> as folder path')
    parser.add_argument('-o', '--output', type=str, default=None,
                        help='output file path')
    parser.add_argument('-e', '--encoding', type=str, default='utf8',
                        help='input/output file encoding')
    parser.add_argument('-f', '--force', action='store_true', default=False,
                        help='open mode for output file')
    args = parser.parse_args(args)

    encoding = args.encoding
    omode = 'wt' if args.force else 'xt'

    counter = {}  # filled by feed(), read by lst()

    def consume(fin):
        for line in fin:
            feed(counter, line)

    may_glob_pattern = args.glob_pattern
    if may_glob_pattern is None:
        # single input file (or stdin)
        with may_open_stdin(args.input, 'rt', encoding=encoding) as fin:
            consume(fin)
    else:
        glob_pattern = may_glob_pattern
        may_root = args.input
        root = '.' if not may_root else may_root
        for path in iter_files(root, glob_pattern):
            try:
                with open(path, 'rt', encoding=encoding) as fin:
                    consume(fin)
            except UnicodeDecodeError:
                # report and skip files not in the expected encoding
                print_err(path)
                continue
            except:
                # report the offending path, then re-raise
                print_err(path)
                raise

    ls = lst(counter)
    with may_open_stdout(args.output, omode, encoding=encoding) as fout:
        show(fout, ls)
def main(argv=None):
    """CLI entry: extract text from ctext.org / ctext.cn.

    Input is either a saved page (-i / stdin) or a live url (-url),
    optionally a whole range of chapter sub-pages (-rng), driven by the
    book's subcontents listing.  Extracted (title, txt) pairs are cached
    in --cache_fname; captcha images go to --captcha_image_db_fname.
    Output format:
        [book]:{book_title}      (unless --without_book_title)
        [chapter{i}]:{title}
        {txt}
    Returns 0 on success.
    """
    import argparse
    from seed.io.may_open import may_open_stdin, may_open_stdout

    parser = argparse.ArgumentParser(
        description='extract text on ctext.org or ctext.cn')
    parser.add_argument('-e', '--encoding', type=str, default='utf8',
                        help='input/output file encoding')
    parser.add_argument('-i', '--input', type=str, default=None,
                        help='input file path')
    parser.add_argument('-o', '--output', type=str, default=None,
                        help='output file path')
    parser.add_argument('-f', '--force', action='store_true', default=False,
                        help='open mode for output file')
    parser.add_argument('--append', action='store_true', default=False,
                        help='open mode for output file')
    parser.add_argument('-V', '--verbose', action='store_true', default=False,
                        help='show path/url that opened')
    parser.add_argument('-url', '--url', type=str, default=None,
                        help='input webpage url')
    parser.add_argument('-rng', '--range', type=int, default=None, nargs=2,
                        help='input webpage url range (first, last); {url}/{i} for i in range')
    parser.add_argument('-ifmt', '--index_format', type=str, default='{}',
                        help='index python format for webpage url; base_url/{fmt}')
    parser.add_argument('--timeout', type=int, default=10,
                        help='timeout for urllib')
    parser.add_argument('--time_sep', type=int, default=1,
                        help='time space between two downloads')
    parser.add_argument('--without_book_title', action='store_true',
                        default=False,
                        help='not show book_title')
    parser.add_argument('--book_title_at', type=str, default=None,
                        help='extended url for book_title; {base_url}{book_title_at}')
    parser.add_argument('--cache_fname', type=str, required=True,
                        help='cache file name; to store middle extract data; Map url (title, txt)')
    parser.add_argument('--captcha_image_db_fname', type=str, required=True,
                        help='cache file name; to store (correct or wrong) captcha string and its image bytes; Map "{i}_{captcha}" (correct, image_bytes)')
    args = parser.parse_args(argv)

    encoding = args.encoding
    omode = 'wt' if args.force else 'xt'
    if args.append:
        omode = 'at'
    if args.input is not None and args.url is not None:
        raise ValueError('input both file and url at same time')

    if args.url is not None:
        self = ExtractCTextOrg(
            cache_fname=args.cache_fname,
            captcha_image_db_fname=args.captcha_image_db_fname)
        if args.range is None:
            # single page
            title, txt = self.cached_extract_ctext_org__url(
                args.url, referrer=None, verbose=args.verbose,
                subcontents=False, timeout=args.timeout)
            may_book_title = None
            begin = 0
            result = (may_book_title, begin, [(title, txt)])
        # NOTE(fix): an unreachable `elif 0:` branch (an older url-range
        # implementation via unordered_iter_extract_ctext_org__url_rng)
        # was removed here.
        else:
            # chapter range driven by the book's subcontents listing
            first, last = args.range
            if (first, last) == (0, 0):
                # (0, 0) means "all chapters"
                first, last = 1, None
                begin, end = None, None
            else:
                assert first >= 1
                begin, end = first - 1, last
            base_url = args.url
            if args.book_title_at is None:
                book_title_url = base_url
            else:
                book_title_url = f'{base_url}{args.book_title_at}'
            ((book_title, book_url), subtitle_url_pairs
             ) = self.cached_extract_ctext_org__url(
                book_title_url, referrer=None, verbose=args.verbose,
                subcontents=True, timeout=args.timeout)
            if args.without_book_title:
                may_book_title = None
            else:
                may_book_title = book_title
            subtitle_url_pairs = subtitle_url_pairs[begin:end]
            referrer_url_pairs = [
                (book_url, sub_url)
                for subtitle, sub_url in subtitle_url_pairs]
            # download into the cache in arbitrary order ...
            it = self.unordered_iter_extract_ctext_org__referrer_url_pairs(
                referrer_url_pairs, verbose=args.verbose,
                timeout=args.timeout, time_sep=args.time_sep)
            for _ in it:
                pass

            # ... then replay the chapters in order from the cache
            def tmp__ordered_iter_extract_ctext_org__url_rng__cache_only():
                for (referrer, url), (subtitle, _) in zip(
                        referrer_url_pairs, subtitle_url_pairs):
                    title, txt = self.cache[url]
                    #yield title, txt
                    yield subtitle, txt
            it = tmp__ordered_iter_extract_ctext_org__url_rng__cache_only()
            begin = first
            #result = (may_book_title, begin, list(it))
            result = (may_book_title, begin, iter(it))
    else:
        self = ExtractCTextOrgBase()
        may_ifname = args.input
        try:
            # open as text file
            with may_open_stdin(may_ifname, 'rt', encoding=encoding) as fin:
                title, txt = self.extract_ctext_org__text(
                    fin, verbose=args.verbose, timeout=args.timeout)
        except UnicodeError:
            assert may_ifname is not None
            ifname = may_ifname
            # open as binary file
            with open(ifname, 'rb') as fin:
                title, txt = self.extract_ctext_org__text(
                    fin, verbose=args.verbose, timeout=args.timeout)
        may_book_title = None
        begin = 0
        result = (may_book_title, begin, [(title, txt)])

    #result :: (may_book_title, begin, [(title, txt)])
    may_ofname = args.output
    with may_open_stdout(may_ofname, omode, encoding=encoding) as fout:
        # FIX: parameter renamed from *args to *items so the argparse
        # namespace `args` is not shadowed inside the helper
        def fprint(*items, **kwargs):
            print(*items, file=fout, **kwargs)
        may_book_title, begin, title_txt_pairs = result
        if not args.without_book_title:
            fprint(f'[book]:{may_book_title}')
        for i, (title, txt) in enumerate(title_txt_pairs, begin):
            fprint(f'[chapter{i}]:{title}')
            fprint(txt)
    if hasattr(self, 'close'):
        self.close()
    parser.exit(0)
    return 0
def main(argv=None):
    '''CLI entry: run an extractor over every path listed in the input.

    The positional argument names a python callable (e.g. math.log2)
    imported by qualified name; unrecognized command-line options are
    parsed into (args, kwargs) and forwarded to it for every path.
    '''
    import argparse, sys
    parser = argparse.ArgumentParser(
        description='extract info from files',
        allow_abbrev=False,
        epilog='''
NOTE:
    extract_data_cmd -oe gbk -o ./1+2.txt -ie ascii -i ./paths.txt nn_ns.filedir._extractor_example.main --encoding=utf8
where:
    -ie ascii
        the encoding of input file
        which contains paths to files from which data were extracted.
        arg of this program
    --encodinga utf8
        the encoding of all files from which data were extracted.
        arg of nn_ns.filedir._extractor_example
glob_cmd ./*.html | line_filter_cmd chapter(\\d+)\\.html --group_names 1 --INT_GROUP | sort_lines_cmd --line_type=KEY_LINE | extract_data_cmd -oe gbk -o ./1+2.txt nn_ns.filedir._extractor_example.main --encoding=utf8
''')
    add_argument = parser.add_argument
    add_argument('extractor', type=str,
                 help='fullname of a python function: e.g. math.log2')
    add_argument('-i', '--input_file', type=str, default=None,
                 help='the input file which contains paths')
    add_argument('-ie', '--input_encoding', type=str,
                 default=Globals.input_file_encoding,
                 help='the encoding of input file')
    add_argument('-o', '--output_file', type=str, default=None,
                 help='the output file')
    add_argument('-oe', '--output_encoding', type=str,
                 default=Globals.output_file_encoding,
                 help='the encoding of output file')
    args, unknown_args = parser.parse_known_args(argv)
    _args, _kwargs = parse_unknown_args(unknown_args)
    _kwargs = dict(_kwargs)

    extractor = import_object(args.extractor)
    # extractor :: (fout, input_fname, **kwargs) -> None
    with may_open_stdout(args.output_file, 'xt',
                         encoding=args.output_encoding) as fout\
            , may_open_stdin(args.input_file, 'rt',
                             encoding=args.input_encoding) as fin:
        for line in fin:
            # strip at most one trailing newline; the remainder is a path
            if line[-1:] == '\n':
                line = line[:-1]
            path = line
            extractor(fout, path, *_args, **_kwargs)
    #parser.exit()
    return
def main(argv=None):
    '''CLI entry: filter lines by regexes into ordered output slots.

    Each input line is matched (re.search) against the given patterns in
    order and goes into the slot of the first matching pattern;
    non-matching lines are dropped.  With --group_names each kept line is
    output as repr(((groups...), line)).  Finally all slots are written
    out in pattern order.
    '''
    import argparse, sys
    parser = argparse.ArgumentParser(description='filter lines')
    add_argument = parser.add_argument
    add_argument('regex_patterns', type=str, nargs='+',
                 metavar='REGEX_PATTERN',
                 help='regular expressions; if line match i-th regex, then put it into the i-th output slot; finally, all slots will be chained together')
    add_argument('--group_names', type=str, nargs='*', metavar='NAME',
                 help='if group_names was set, then snap that named group of regex; each line will be output as ((groups...), repr(line))')
    '''#group name can not startswith digit...
    add_argument('--INT_NAME', action='store_true'
        , default=False
        , help='a group name is treated as integer if possible')
    '''
    add_argument('--INT_GROUP', action='store_true', default=False,
                 help='a named group is treated as integer if possible')
    add_argument('-i', '--input_file', type=str, default=None,
                 help='the input file')
    add_argument('-ie', '--input_encoding', type=str,
                 default=Globals.input_file_encoding,
                 help='the encoding of input file')
    add_argument('-o', '--output_file', type=str, default=None,
                 help='the output file')
    add_argument('-oe', '--output_encoding', type=str,
                 default=Globals.output_file_encoding,
                 help='the encoding of output file')
    args = parser.parse_args(argv)

    re_objs = list(map(re.compile, args.regex_patterns))

    def to_int_if_possible(s):
        try:
            return int(s)
        except:
            return s

    def to_ints_if_possible(strs):
        return list(map(to_int_if_possible, strs))

    def get_groups(m, group_names):
        # m.group(*group_names) will error if len <= 1
        return tuple(map(m.group, group_names))

    slots = [[] for _ in range(len(re_objs))]
    if args.group_names:  # and args.INT_NAME:
        args.group_names = to_ints_if_possible(args.group_names)

    with may_open_stdout(args.output_file, 'xt',
                         encoding=args.output_encoding) as fout\
            , may_open_stdin(args.input_file, 'rt',
                             encoding=args.input_encoding) as fin:
        EOF = False
        for line in fin:
            if EOF:
                # FIX: was `raise logic-error`, a NameError at runtime
                raise RuntimeError('logic-error: line after EOF')
            if line == '':  # EOF?
                EOF = True
                continue
            if line[-1:] == '\n':
                line = line[:-1]
            for i, rex in enumerate(re_objs):
                #m = rex.match(line)
                m = rex.search(line)
                if m:
                    break
            else:
                # no match - drop this line
                continue
            if args.group_names:
                # named_groups
                groups = get_groups(m, args.group_names)
                if args.INT_GROUP:
                    # convert to int
                    groups = to_ints_if_possible(groups)
                    groups = tuple(groups)
                out_line = repr((groups, line))
            else:
                out_line = line  # no repr!!
            #print(out_line, file=fout)
            slots[i].append(out_line)
        for out_line in chain.from_iterable(slots):
            print(out_line, file=fout)
    #parser.exit()
    return
def main(argv=None):
    '''CLI entry point: stable sort of input lines.

    --line_type=RAW_LINE : lines are compared and written out as-is.
    --line_type=KEY_LINE : each input line is "repr((keys, org_line))"
        (the output format of the companion filter script); lines are
        sorted by the evaluated tuple and only org_line is written out.

    :param argv: argument list for argparse (None => sys.argv[1:])
    '''
    import argparse, sys

    parser = argparse.ArgumentParser(description='stable sort lines')
    add_argument = parser.add_argument
    add_argument('--line_type', choices='RAW_LINE KEY_LINE'.split()
        , default=RAW_LINE
        , help='type of input lines')
    add_argument('--unique', action='store_true'
        , default=False
        , help='remove duplicate lines')
    add_argument('--reverse', action='store_true'
        , default=False
        , help='stable sort by reverse ordering')
    add_argument('-i', '--input_file', type=str
        , default=None
        , help='the input file')
    add_argument('-ie', '--input_encoding', type=str
        , default=Globals.input_file_encoding
        , help='the encoding of input file')
    add_argument('-o', '--output_file', type=str
        , default=None
        , help='the output file')
    add_argument('-oe', '--output_encoding', type=str
        , default=Globals.output_file_encoding
        , help='the encoding of output file')

    args = parser.parse_args(argv)

    # argparse `choices` guarantees exactly one of the two branches is taken
    if args.line_type == RAW_LINE:
        def input_line2val(input_line):
            # the line itself is the sort key
            val = org_line = input_line
            return val
        def val2org_line(val):
            return val
    elif args.line_type == KEY_LINE:
        def input_line2val(input_line):
            # input_line is "repr((keys, org_line))" with the newline stripped
            assert input_line[-1:] != '\n'
            try:
                keys, org_line = ast.literal_eval(input_line)
            except Exception:
                # show the offending line before propagating (was a bare except)
                print(input_line)
                print(repr(input_line))
                raise
            assert type(keys) is tuple
            return keys, org_line
        def val2org_line(val):
            keys, org_line = val
            return org_line

    vals = []  # [org_line] | [(keys, org_line)]
    with may_open_stdout(args.output_file, 'xt'
            , encoding=args.output_encoding) as fout\
        , may_open_stdin(args.input_file, 'rt'
            , encoding=args.input_encoding) as fin:
        EOF = False
        for line in fin:
            #print(repr(line))
            #bug:
            #   xxx.bat xxx | this_cmd
            #   xxx.bat should be:
            #       @xxx_cmd args
            #       @yyy_cmd args
            if EOF:
                # was "raise logic - error": a NameError in disguise;
                # make the intended sanity check explicit instead
                raise RuntimeError('logic error: line seen after EOF mark')
            if line == '':  # EOF? (defensive; text-file iteration should not yield '')
                EOF = True
                continue
            if line[-1:] == '\n':
                line = line[:-1]
            input_line = line
            vals.append(input_line2val(input_line))
        # list.sort is stable, as the script name promises
        vals.sort(reverse=args.reverse)
        if args.unique:
            # vals is sorted, so groupby collapses adjacent duplicates
            vals = [unique_line for unique_line, _ in groupby(vals)]
        for val in vals:
            org_line = val2org_line(val)
            print(org_line, file=fout)
    #parser.exit()
    return
def main(argv=None):
    '''CLI entry point: extract text on ctext.org or ctext.cn.

    Input is either a saved HTML file (-i), a single page url (-url), or a
    url plus an index range (-url + -rng) where page i is "{url}/{fmt(i)}".
    Output is "[book]:{title}" (unless --without_book_title) followed by
    "[chapter{i}]:{title}" / text blocks.

    :param argv: argument list for argparse (None => sys.argv[1:])
    '''
    import argparse
    from seed.io.may_open import may_open_stdin, may_open_stdout

    parser = argparse.ArgumentParser(
        description='extract text on ctext.org or ctext.cn')
    parser.add_argument('-e', '--encoding', type=str, default='utf8',
        help='input/output file encoding')
    parser.add_argument('-i', '--input', type=str, default=None,
        help='input file path')
    parser.add_argument('-o', '--output', type=str, default=None,
        help='output file path')
    parser.add_argument('-f', '--force', action='store_true', default=False,
        help='open mode for output file')
    parser.add_argument('--append', action='store_true', default=False,
        help='open mode for output file')
    parser.add_argument('-V', '--verbose', action='store_true', default=False,
        help='show path/url that opened')
    parser.add_argument('-url', '--url', type=str, default=None,
        help='input webpage url')
    parser.add_argument(
        '-rng', '--range', type=int, default=None, nargs=2,
        help='input webpage url range (first, last); {url}/{i} for i in range')
    parser.add_argument(
        '-ifmt', '--index_format', type=str, default='{}',
        help='index python format for webpage url; base_url/{fmt}')
    parser.add_argument('--timeout', type=int, default=10,
        help='timeout for urllib')
    parser.add_argument('--time_sep', type=int, default=1,
        help='time space between two downloads')
    parser.add_argument('--without_book_title', action='store_true', default=False,
        help='not show book_title')
    parser.add_argument(
        '--book_title_at', type=str, default=None,
        help='extended url for book_title; {base_url}{book_title_at}')

    args = parser.parse_args(argv)
    encoding = args.encoding
    omode = 'wt' if args.force else 'xt'
    if args.append:
        omode = 'at'  # --append overrides --force
    if args.input is not None and args.url is not None:
        raise ValueError('input both file and url at same time')

    if args.url is not None:
        if args.range is None:
            # single page download
            title, txt = extract_ctext_org__url(args.url,
                verbose=args.verbose, timeout=args.timeout)
            may_book_title = None
            begin = 0
            result = (may_book_title, begin, [(title, txt)])
        else:
            # download pages first..last inclusive
            first, last = args.range
            begin, end = first, last + 1
            rng = range(begin, end)
            base_url = args.url
            index_format = args.index_format
            it = iter_extract_ctext_org__url_rng(base_url, rng, index_format,
                verbose=args.verbose, timeout=args.timeout,
                time_sep=args.time_sep)
            if args.without_book_title:
                may_book_title = None
            else:
                if args.book_title_at is None:
                    book_title_url = base_url
                else:
                    book_title_url = f'{base_url}{args.book_title_at}'
                book_title, _ = extract_ctext_org__url(book_title_url,
                    verbose=args.verbose, timeout=args.timeout)
                may_book_title = book_title
            # (removed dead no-op "begin = begin")
            #result = (may_book_title, begin, list(it))
            result = (may_book_title, begin, iter(it))  # lazy: download while writing
    else:
        may_ifname = args.input
        try:
            # open as text file
            with may_open_stdin(may_ifname, 'rt', encoding=encoding) as fin:
                title, txt = extract_ctext_org(fin,
                    verbose=args.verbose, timeout=args.timeout)
        except UnicodeError:
            # stdin cannot be reopened, so this fallback needs a real path
            assert may_ifname is not None
            ifname = may_ifname
            # open as binary file
            with open(ifname, 'rb') as fin:
                title, txt = extract_ctext_org(fin,
                    verbose=args.verbose, timeout=args.timeout)
        may_book_title = None
        begin = 0
        result = (may_book_title, begin, [(title, txt)])

    #result :: (may_book_title, begin, [(title, txt)])
    may_ofname = args.output
    with may_open_stdout(may_ofname, omode, encoding=encoding) as fout:
        def fprint(*args, **kwargs):
            print(*args, file=fout, **kwargs)
        may_book_title, begin, title_txt_pairs = result
        if not args.without_book_title:
            # NOTE(review): in the single-page/file branches may_book_title is
            # None, so this prints "[book]:None" — confirm that is intended
            fprint(f'[book]:{may_book_title}')
        for i, (title, txt) in enumerate(title_txt_pairs, begin):
            fprint(f'[chapter{i}]:{title}')
            fprint(txt)
    parser.exit(0)
    return 0
def main(args=None, /):
    '''CLI entry point: parse unicode::UCD::PropList.txt.

    Reads PropList.txt, parses it into a property_name -> uint-rngs mapping,
    and writes either the sorted property names (--show_property_names_only)
    or a literal-text rendering of the whole parsed result (decimal, or hex
    with --hex).

    :param args: argument list for argparse (None => sys.argv[1:])
    '''
    from pprint import pprint
    import argparse
    from seed.io.may_open import may_open_stdin, may_open_stdout
    #pprint signature for reference:
    #pprint(object, stream=None, indent=1, width=80, depth=None, *, compact=False, sort_dicts=True)

    parser = argparse.ArgumentParser(
        description='parse unicode::UCD::PropList.txt',
        epilog='',
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument('--show_property_names_only', action='store_true',
        default=False, help='output property_names without uint-rngs')
    parser.add_argument('--hex', action='store_true',
        default=False, help='output uint-rngs in hex/radix<16>')
    parser.add_argument('-i', '--input', type=str, default=None,
        help='input file path for unicode::UCD::PropList.txt')
    parser.add_argument('-o', '--output', type=str, default=None,
        help='output file path')
    parser.add_argument('-e', '--encoding', type=str, default='utf8',
        help='input/output file encoding')
    parser.add_argument('-f', '--force', action='store_true', default=False,
        help='open mode for output file')

    args = parser.parse_args(args)
    encoding = args.encoding
    omode = 'wt' if args.force else 'xt'

    may_ifname = args.input
    with may_open_stdin(may_ifname, 'rt', encoding=encoding) as fin:
        #lines = [*fin]
        lines = iter(fin)
        # (was a duplicated "parsed_result = parsed_result = ..." assignment)
        parsed_result = parse__PropList_txt(lines, result_readonly=False)
    repr_result = parsed_result2literal_text(parsed_result,
        decimal_vs_hex=args.hex)

    may_ofname = args.output
    with may_open_stdout(may_ofname, omode, encoding=encoding) as fout:
        # deliberately shadow builtin print with a file-bound version
        print = mk_fprint(fout)
        if args.show_property_names_only:
            attr2rngs = parsed_result
            for property_name in sorted(attr2rngs):
                print(property_name)
        else:
            #pprint(parsed_result, stream=fout, indent='')
            # stable_repr output (removed a stray bare "stable_repr" token)
            print(repr_result)
def main(argv=None): import argparse import sys parser = argparse.ArgumentParser( description='find(and replace) non gbk char in novel text.') parser.add_argument('-e', '--encoding', type=str, default='utf8', help='input/output file encoding') parser.add_argument('-f', '--force', action='store_true', default=False, help='open mode for output file') parser.add_argument('-i', '--input', type=str, help='path to the input novel text file') parser.add_argument('-o', '--output', type=str, help='path to the output file') parser.add_argument( '-r', '--replace', action='store_true', default=False, help='output replaced file instead of sorted non-gbk chars') args = parser.parse_args(argv) encoding = args.encoding omode = 'wt' if args.force else 'xt' may_ifname = args.input #all_chars = set() with may_open_stdin(may_ifname, 'rt', encoding=encoding) as fin: if not args.replace: all_chars = file2all_char_set(fin) else: txt = fin.read() all_chars = set(txt) chars = find_nonGBK_chars(all_chars) #chars.update(ch for ch in all_chars if not ch.isprintable()) #chars.update(ch for ch in all_chars if ord(ch) < 0x100 and (ord(ch) > 0x7F or not ch.isalnum())) if not chars: print('no nonGBK chars at all', file=sys.stderr) return assert chars if not args.replace: s = repr_string_as_unicode(sorted(chars)) else: txt #bug:pattern = '[' + '|'.join(map(repr_char_as_unicode, chars)) + ']' #pattern = '[' + ''.join(map(repr_char_as_unicode, chars)) + ']' pattern = make_chars_pattern(chars) def replace(m): char = m.group(0) return repr_char_as_unicode(char) # \Uxxxxxxxx ; no "" s = re.sub(pattern, replace, txt) may_ofname = args.output with may_open_stdout(may_ofname, omode, encoding=encoding) as fout: print(s, file=fout) '''
def main(args=None):
    '''Command-line driver: post-process charset_filter.py output to report
    subset relationships among the CJK parts of character encodings.

    Writes the relation table to --output, the derived result to --output2,
    and finally shows the extended analysis keyed by a fixed list of names.

    :param args: argument list for argparse (None => sys.argv[1:])
    '''
    import argparse
    from pprint import pprint
    from seed.io.may_open import may_open_stdin, may_open_stdout

    parser = argparse.ArgumentParser(
        description="find out relationship of cjk part of encodings (postprocess of charset_filter.py)"
        , epilog=""
        , formatter_class=argparse.RawDescriptionHelpFormatter
        )
    parser.add_argument('-e', '--encoding', type=str
        , default='utf8'
        , help='input/output file encoding')
    parser.add_argument('-i', '--input', type=str, default=None
        , help='input file path')
    parser.add_argument('-o', '--output', type=str, default=None
        , help='output file path')
    parser.add_argument('-f', '--force', action='store_true'
        , default=False
        , help='open mode for output file')
    parser.add_argument('-o2', '--output2', type=str, default=None
        , help='output2 file path')
    parser.add_argument('-f2', '--force2', action='store_true'
        , default=False
        , help='open mode for output2 file')

    opts = parser.parse_args(args)
    enc = opts.encoding
    out_mode = 'wt' if opts.force else 'xt'
    out_mode2 = 'wt' if opts.force2 else 'xt'

    # slurp the charset_filter.py output (file or stdin)
    with may_open_stdin(opts.input, 'rt', encoding=enc) as in_stream:
        whole_text = in_stream.read()

    # encoding -> cjk part rngs, then the pairwise subset relation and the
    # two derived analyses built on top of it
    enc2cjk_part_rngs = txt2cjk_d(whole_text)
    relation_table = subset_relation_of_encoding2cjk_part_rngs(enc2cjk_part_rngs)
    more_result = handle_encoding2relation2encodings(relation_table)
    more_result2 = handle_more_result(more_result, enc2cjk_part_rngs)

    # first output: the raw relation table
    with may_open_stdout(opts.output, out_mode, encoding=enc) as out_stream:
        pprint(relation_table, stream=out_stream)
    # second output: the derived result
    with may_open_stdout(opts.output2, out_mode2, encoding=enc) as out_stream2:
        pprint(more_result, stream=out_stream2)

    # fixed key order for the extended analysis display
    ks = r"""
        encoding_cjk_part_eq_classes__txt
        std_cjk_eq_encoding_lt_pairs__txt
        std_cjk_eq_encoding_atomic_lt_pairs__txt
        atomic_buttomup__txt
        atomic_topdown__txt
        std_cjk_eq_encoding2cjk_part_size__txt
        snd_buttoms__txt
        snd_buttom_subset2nonempty_common_rngs__txt
        snd_buttom_subsets_with_empty_common_rngs__txt
        """.split()
    show_more_result2(ks, more_result2)