def scrape(input_cfg=None):
    """Build a soup from the configured input and pass it to ``scrape_soup``.

    The input source is chosen from the processed configuration, in this
    order of precedence:

    1. ``cfg["infile"]`` -- a path whose contents are read and parsed.
    2. ``cfg["html"]``   -- an HTML string that is parsed directly.
    3. ``cfg["url"]``    -- a URL fetched via ``jtutils.url_to_soup`` using the
       ``js``, ``cookies``, ``headers`` and ``params`` settings.

    Args:
        input_cfg: Optional configuration handed to ``jtutils.process_cfg``
            together with this module's ``parser()`` and ``internal_args()``.

    Returns:
        Whatever ``scrape_soup(soup, cfg)`` returns.

    Raises:
        RuntimeError: If none of ``infile``, ``html`` or ``url`` is set.
    """
    cfg = jtutils.process_cfg(input_cfg, parser(), internal_args())

    if cfg["infile"]:
        with open(cfg["infile"]) as f_in:
            soup = jtutils.html_to_soup(f_in.read())
    elif cfg["html"]:
        soup = jtutils.html_to_soup(cfg["html"])
    elif cfg["url"]:
        soup = jtutils.url_to_soup(cfg["url"], cfg["js"], None,
                                   cfg["cookies"], cfg["headers"],
                                   cfg["params"])
    else:
        # A bare ``raise`` here has no active exception to re-raise and only
        # produces an opaque RuntimeError; say what is actually missing.
        raise RuntimeError(
            "Couldn't find input for scrape: "
            "one of 'infile', 'html' or 'url' must be set")

    return scrape_soup(soup, cfg)
def pcsv(input_cfg=None):
    """Stream a delimited file row by row, running user code on each row.

    The configuration supplies optional code snippets: ``begin_code`` (run
    once before the loop), ``grep_code`` (an expression evaluated per row;
    rows for which it is falsy are skipped), ``process_code`` (run per row)
    and ``end_code`` (run once after the loop).  The snippets are executed
    with ``exec``/``eval`` inside this function, so they see its local
    names -- notably ``r`` (the current row as an ``IndexDict``), ``l`` (the
    raw line), ``i`` (the row index) and ``s`` (the set loaded from
    ``cfg["set"]``, when given).  For that reason the body is deliberately
    NOT split into helper functions and the local names are kept stable.

    NOTE: the snippets are executed verbatim; never pass untrusted code.

    Args:
        input_cfg: Optional configuration handed to ``process_cfg``.  When
            truthy, the function is treated as a library call: ``sys.stdout``
            is captured for the duration of the call and its contents are
            returned.

    Returns:
        The captured output as a string when ``input_cfg`` is truthy,
        otherwise ``None`` (output goes to the real ``sys.stdout``).

    Raises:
        Exception: If ``input_cfg`` is given but neither ``input`` nor
            ``infile`` is configured, or if a row's length differs from the
            header's and neither ``fix`` nor ``autofix`` is set.
    """
    cfg = process_cfg(input_cfg, parser(), internal_args())
    if input_cfg and not cfg["input"] and not cfg["infile"]:
        raise Exception("Couldn't find input for pcsv")
    if sys.stdin.isatty() and (not cfg["input"]) and (not cfg["infile"]):
        sys.stderr.write(
            "WARNING: pcsv using /dev/stdin as default input file (-f) but nothing seems to be piped in..."
            + "\n")

    # For non-commandline use, capture sys.stdout so it can be returned.
    backup = None
    _captured_stdout = None
    if input_cfg:  # not running from the command-line script
        backup = sys.stdout
        _captured_stdout = six.StringIO()
        sys.stdout = _captured_stdout

    _opened_infile = None  # only set when this function opened the file itself
    try:
        if cfg["input"]:
            f_in = six.StringIO(cfg["input"])
        elif not cfg["infile"]:
            f_in = sys.stdin
        else:
            if sys.version_info[0] >= 3:
                # don't crash on invalid unicode
                f_in = open(cfg["infile"], errors='ignore')
            else:
                f_in = open(cfg["infile"])
            _opened_infile = f_in

        if cfg["delimiter"] == "TAB":
            cfg["delimiter"] = '\t'
        elif cfg["delimiter"] == "\\t":
            cfg["delimiter"] = '\t'

        keep_list = process_cut_csv(cfg["keep_list"])
        drop_list = process_cut_csv(cfg["drop_list"])

        in_hdr = None
        out_hdr = None
        has_exceptions = False
        # Never set to True anywhere below; kept so the length-mismatch
        # branch and any user snippet that reads it behave as before.
        has_printed_incomplete_line = False

        begin_code = None
        process_code = None
        end_code = None
        grep_code = None
        if cfg["begin_code"]:
            _check_is_list(cfg, "begin_code")
            begin_code = [pindent(code) for code in cfg["begin_code"]]
            # Compile the re-indented snippets (not the raw cfg entries).
            begin_code = [compile(code, '', 'exec') for code in begin_code]
        if cfg["grep_code"]:
            grep_code = pindent(cfg["grep_code"])
            # preprocess /.*/ syntax
            grep_code = gen_grep_code(grep_code)
            grep_code = compile(grep_code, '', 'eval')
        if cfg["process_code"]:
            _check_is_list(cfg, "process_code")
            process_code = [pindent(code) for code in cfg["process_code"]]
            process_code = [compile(code, '', 'exec') for code in process_code]
        if cfg["end_code"]:
            _check_is_list(cfg, "end_code")
            end_code = [pindent(code) for code in cfg["end_code"]]
            end_code = [compile(code, '', 'exec') for code in end_code]

        if begin_code:
            for code in begin_code:
                exec(code)

        if cfg["set"]:
            with open(cfg["set"]) as _set_file:
                s = set(l.strip() for l in _set_file)

        # main iteration loop
        for i, (l, _csvlist) in enumerate(
                csv_row_and_raw(f_in, delimiter=cfg["delimiter"])):
            is_header_line = (i == 0 and not cfg["no_header"])
            if not in_hdr and cfg["no_header"]:
                # create a dummy header from the length of the line
                in_hdr = ["X" + str(j) for j, _ in enumerate(_csvlist)]
                hdrhash = dict((jx, j) for j, jx in enumerate(in_hdr))
                # IndexDict can be accessed by string or index
                # (all keys must be strings)
                r = IndexDict(hdrhash, _csvlist)
            elif not in_hdr:
                # read in the header
                in_hdr = _csvlist[:]
                if len(in_hdr) != len(set(in_hdr)):
                    sys.stderr.write(
                        "WARNING: duplicated header columns. Using dummy header instead"
                        + '\n')
                    in_hdr = rename_duplicate_header(_csvlist)
                hdrhash = dict((jx, j) for j, jx in enumerate(in_hdr))
                if not _csvlist:
                    _csvlist = [''] * len(in_hdr)
                r = IndexDict(hdrhash, _csvlist)
                # TODO: what's this block for?
                # Guarded so that no_print without any process code does not
                # try to iterate over None.
                if cfg["no_print"] and process_code:
                    for code in process_code:
                        exec(code)
                continue  # _csvlist is the header, don't process it as a row
            else:
                # setup for regular rows
                if len(_csvlist) != len(in_hdr):
                    if cfg["fix"]:
                        # fix mode: emit the malformed raw line
                        sys.stdout.write(l + "\n")
                        continue
                    elif cfg["autofix"]:
                        # autofix mode: silently drop the malformed line
                        continue
                    elif not has_printed_incomplete_line:
                        raise Exception(
                            "ERROR: line length not equal to header length. Try running pcsv.py --fix or pcsv.py --autofix"
                        )
                if not _csvlist:
                    _csvlist = [''] * len(in_hdr)
                r = IndexDict(hdrhash, _csvlist)

            # run process and grep code
            try:
                if grep_code and not is_header_line and not eval(grep_code):
                    continue
                if process_code and (not is_header_line):
                    for code in process_code:
                        exec(code)
            except Exception:
                # ``Exception`` rather than a bare except so Ctrl-C and
                # sys.exit() are never swallowed by exceptions_allowed.
                if not cfg["exceptions_allowed"]:
                    raise
                if not has_exceptions:
                    sys.stderr.write("WARNING: exception" + '\n')
                    has_exceptions = True
                continue

            # The output header is computed after processing the first row
            # (this allows auto adding of new columns),
            # like new in this case: -p 'r["new"] = 2 * float(r["old"])'
            if not cfg["no_print"] and not out_hdr:
                out_hdr = print_header(in_hdr, r, keep_list, drop_list)

            # print line
            if not (cfg["fix"] or cfg["no_print"]):
                rout = [str(r[h]) for h in out_hdr]
                write_line(rout)

        # Header not produced yet, e.g. the file has only a header and no
        # rows.  Skipped for completely empty input, where no header (and no
        # ``r``) was ever read.
        if not cfg["no_print"] and not out_hdr and in_hdr is not None:
            out_hdr = print_header(in_hdr, r, keep_list, drop_list)

        if end_code:
            for code in end_code:
                exec(code)

        if input_cfg:  # not running from the script: hand back the output
            out = _captured_stdout.getvalue()
            return out
    finally:
        # Always undo the stdout capture and release the input file, even
        # when user code or a malformed row raised.
        if backup is not None:
            sys.stdout = backup
        if _opened_infile is not None:
            _opened_infile.close()
def pawk(input_cfg=None):
    """Stream delimited input line by line, running user code on each line.

    The configuration supplies optional code snippets: ``begin_code`` (run
    once before the loop), ``grep_code`` (an expression evaluated per line;
    lines for which it is falsy are skipped), ``process_code`` (run per
    line) and ``end_code`` (run once after the loop).  The snippets are
    executed with ``exec``/``eval`` inside this function, so they see its
    local names -- notably ``r`` (the current line's field list), ``l`` (the
    raw line), ``i`` (the line index) and ``s`` (the set loaded from
    ``cfg["set"]``, when given).  For that reason the body is deliberately
    NOT split into helper functions and the local names are kept stable.

    Lines are written automatically only when no ``process_code`` is
    configured; otherwise the process code is expected to do its own output.

    NOTE: the snippets are executed verbatim; never pass untrusted code.

    Args:
        input_cfg: Optional configuration handed to ``process_cfg``.  When
            truthy, the function is treated as a library call: ``sys.stdout``
            is captured for the duration of the call and its contents are
            returned.

    Returns:
        The captured output as a string when ``input_cfg`` is truthy,
        otherwise ``None`` (output goes to the real ``sys.stdout``).

    Raises:
        Exception: If ``input_cfg`` is given but neither ``input`` nor
            ``infile`` is configured, or if ``grep_code`` is a list.  Errors
            from user snippets propagate unless ``exceptions_allowed`` is set
            (which only covers the per-line grep/process code).
    """
    cfg = process_cfg(input_cfg, parser(), internal_args())
    if input_cfg and not cfg["input"] and not cfg["infile"]:
        raise Exception("Couldn't find input for pawk")
    if sys.stdin.isatty() and (not cfg["input"]) and (not cfg["infile"]):
        sys.stderr.write(
            "WARNING: pawk using /dev/stdin as default input file (-f) but nothing seems to be piped in..."
            + "\n")

    # For non-commandline use, capture sys.stdout so it can be returned.
    backup = None
    _captured_stdout = None
    if input_cfg:  # not running from the pawk script
        backup = sys.stdout
        _captured_stdout = six.StringIO()
        sys.stdout = _captured_stdout

    _opened_infile = None  # only set when this function opened the file itself
    try:
        if cfg["input"]:
            f_in = six.StringIO(cfg["input"])
        elif not cfg["infile"]:
            f_in = sys.stdin
        else:
            if sys.version_info[0] >= 3:
                # don't crash on invalid unicode
                f_in = open(cfg["infile"], errors='ignore')
            else:
                f_in = open(cfg["infile"])
            _opened_infile = f_in

        if cfg["delimiter"] == "TAB":
            cfg["delimiter"] = '\t'
        elif cfg["delimiter"] == "\\t":
            cfg["delimiter"] = '\t'

        # Unused below; kept so user snippets that read them keep working.
        hdr = None
        has_printed_incomplete_line = False

        has_exceptions = False

        # jtrigg@20160102 try out only writing when there's no -p option.
        # Despite its name, a truthy do_write means "process code present,
        # so do NOT auto-write lines".
        do_write = cfg["process_code"]

        if cfg["set"]:
            with open(cfg["set"]) as _set_file:
                s = set(l.strip() for l in _set_file)

        begin_code = None
        process_code = None
        end_code = None
        grep_code = None
        if cfg["begin_code"]:
            _check_is_list(cfg, "begin_code")
            begin_code = [pyindent(c) for c in cfg["begin_code"]]
            begin_code = [compile(code, '', 'exec') for code in begin_code]
        if cfg["grep_code"]:
            if isinstance(cfg["grep_code"], list):
                raise Exception("grep_code can't be list")
            # preprocess /.*/ syntax
            grep_code = gen_grep_code(cfg["grep_code"])
            grep_code = pyindent(grep_code)
            grep_code = compile(grep_code, '', 'eval')
        if cfg["process_code"]:
            _check_is_list(cfg, "process_code")
            process_code = [pyindent(c) for c in cfg["process_code"]]
            process_code = [compile(code, '', 'exec') for code in process_code]
        if cfg["end_code"]:
            _check_is_list(cfg, "end_code")
            end_code = [pyindent(c) for c in cfg["end_code"]]
            end_code = [compile(code, '', 'exec') for code in end_code]

        # NOTE: the exec loops below cannot be wrapped in a helper function,
        # because exec() uses the existing (this function's) environment.
        if begin_code:
            for code in begin_code:
                exec(code)

        for i, (l, _csvlist) in enumerate(
                csvlist_and_raw(f_in, cfg["delimiter"],
                                multiline=cfg["multiline"])):
            r = _csvlist
            try:
                if grep_code and not eval(grep_code):
                    continue
                if process_code:
                    for code in process_code:
                        exec(code)
            except Exception:
                # ``Exception`` rather than a bare except so Ctrl-C and
                # sys.exit() are never swallowed by exceptions_allowed.
                # stdout is intentionally NOT restored here: when the
                # exception is tolerated, capture must stay active for the
                # remaining lines; the ``finally`` below restores it.
                if not cfg["exceptions_allowed"]:
                    raise
                if not has_exceptions:
                    sys.stderr.write("WARNING: exception" + '\n')
                    has_exceptions = True
                continue

            if not do_write:
                write_line(r, cfg["output_delimiter"])

        if end_code:
            for code in end_code:
                exec(code)

        if input_cfg:  # not running from the pawk script: hand back output
            out = _captured_stdout.getvalue()
            return out
    finally:
        # Single place that undoes the stdout capture and releases the input
        # file, on success and on every error path alike.
        if backup is not None:
            sys.stdout = backup
        if _opened_infile is not None:
            _opened_infile.close()