def perform(dirname, stype, convert=None, limit=None):
    """Run ``convert`` over the PDFs found under ``dirname``, tallying outcomes.

    ``walkdir`` supplies ``(dirpath, fname)`` pairs for ``pdf`` files of the
    given ``stype``; at most ``limit`` pairs are consumed (``None`` means no
    cap).  For each pair the output path is ``<dirpath>/<stype>-<bbl>.txt``;
    if that file already exists the BBL is skipped, otherwise
    ``convert(infile, outfile)`` is invoked.

    Returns a ``defaultdict(list)`` whose keys 'seen', 'skip', 'good' and
    'fail' map to the BBLs that fell into each bucket.

    Raises ``ValueError`` when a filename's stype differs from ``stype``.
    """
    log.info("limit = %s" % limit)
    log.info("dirname = %s" % dirname)
    status = defaultdict(list)
    candidates = islice(walkdir(dirname, stype, ['pdf']), limit)
    for dirpath, fname in candidates:
        found_stype, bbl, _ext = split_fname(fname)
        if found_stype != stype:
            raise ValueError("stype mismatch")
        log.debug("stype,bbl = %s,%d .." % (found_stype, bbl))
        status['seen'].append(bbl)
        source = "%s/%s" % (dirpath, fname)
        target = "%s/%s-%d.txt" % (dirpath, stype, bbl)
        if os.path.exists(target):
            # Output already present; do not convert again.
            log.info("SKIP %d" % bbl)
            status['skip'].append(bbl)
            continue
        try:
            convert(source, target)
            log.info("GOOD %d" % bbl)
            status['good'].append(bbl)
        except Exception as e:
            # Best effort: record the failure and carry on with the next file.
            log.info("FAIL %d %s" % (bbl, e))
            log.exception(e)
            status['fail'].append(bbl)
    return status
def process_target(pulldir, stype, bbl):
    """Parse the PDF stored for ``bbl``, or return ``None`` if it is absent.

    The file is looked up at ``<pulldir>/<bbl2dirpath(bbl)>/<stype>-<bbl>.pdf``
    and, when present, opened in binary mode and handed to ``parse``.
    """
    subdir = bbl2dirpath(bbl)
    fname = "%s-%d.pdf" % (stype, bbl)
    infile = "%s/%s/%s" % (pulldir, subdir, fname)
    log.info("infile = %s" % infile)
    if not os.path.exists(infile):
        return None
    with open(infile, "rb") as f:
        return parse(f)
def inventory(dirname, stype):
    """Collect, per file extension, the set of BBLs that have a file on disk.

    Walks ``dirname`` for ``pdf``, ``txt`` and ``html`` files of the given
    ``stype`` and returns a ``defaultdict(set)`` keyed by extension.
    """
    log.info('..')
    by_ext = defaultdict(set)
    for _dirpath, fname in walkdir(dirname, stype, ('pdf', 'txt', 'html')):
        # Only the BBL and the extension matter here; the stype part is unused.
        _, bbl, ext = split_fname(fname)
        by_ext[ext].add(bbl)
    return by_ext
def init_targets(d, args):
    """Load the target list and narrow it down with ``refine``.

    The list is read from ``args.targets`` when given, otherwise from
    ``targets.txt`` under ``d['meta']``.  The resulting count is both printed
    and logged.
    """
    targetsfile = args.targets or "%s/targets.txt" % d['meta']
    print("targets from '%s' .." % targetsfile)
    targets = refine(list(read_ints(targetsfile)), args)
    count = len(targets)
    print("that be %d targets." % count)
    log.info("that be %d targets." % count)
    return targets
def rename_target(pulldir, stype, bbl):
    """Rename the ``.pdf`` file stored for ``bbl`` to ``.html``, if it exists.

    Returns a ``Counter`` with a single increment under 'move' (file renamed)
    or 'skip' (no ``.pdf`` file found).
    """
    tally = Counter()
    subdir = bbl2dirpath(bbl)
    stem = "%s/%s/%s-%d" % (pulldir, subdir, stype, bbl)
    src = "%s.pdf" % stem
    dst = "%s.html" % stem
    if not os.path.exists(src):
        log.info("SKIP oldpath = %s" % src)
        tally['skip'] += 1
        return tally
    log.info("MOVE oldpath = %s" % src)
    os.rename(src, dst)
    tally['move'] += 1
    return tally
def run(spec, targets):
    """Dispatch ``targets`` for the given ``spec`` and log timing and a tally.

    Parameters:
        spec: passed straight through to ``dispatch``.  Previously this
            argument was ignored in favour of the global ``args.spec``; it is
            now honoured so the function does what its signature says.
        targets: the targets handed to ``dispatch``.

    The pull directory is still read from the module-level mapping ``d``
    (``d['pull']``).  ``dispatch`` is expected to return a mapping whose
    values are sized collections; their lengths are logged as the tally.
    Nothing is returned.
    """
    log.info("..")
    t0 = time.time()
    x = dispatch(d['pull'], spec, targets)
    delta = time.time() - t0
    tally = {k: len(v) for k, v in x.items()}
    log.info("done in %.3f sec" % delta)
    log.info("tally = %s" % tally)
def refine(targets, args):
    """Return ``targets`` narrowed by the ``--bounds`` and ``--pace`` flags.

    ``args.bounds``, when set, is parsed with ``parsetup`` into an ``(i, j)``
    pair and applied as the slice ``targets[i:j]``.  ``args.pace``, when set,
    is parsed the same way and its items are passed on to ``modslice``.
    Bounds are applied before pace.
    """
    bounds = None
    if args.bounds:
        bounds = parsetup(args.bounds)
    if bounds:
        start, stop = bounds
        log.info("that be %d raw targets; restricting by range .." % len(targets))
        targets = targets[start:stop]

    pace = None
    if args.pace:
        pace = parsetup(args.pace)
    if pace:
        log.info("that be %d raw targets; restricting by pace .." % len(targets))
        targets = list(modslice(targets, *pace))

    log.info("that be %d targets" % len(targets))
    return targets
def _walkdir():
    """Yield expanded records for every target whose file can be parsed.

    Relies on ``targets``, ``pulldir``, ``stype`` and ``status`` from the
    enclosing scope.  Each BBL is filed under 'fail' (``process_target``
    raised), 'miss' (it returned something falsy) or 'good' in ``status``;
    for good ones the items produced by ``expand(bbl, parsed)`` are yielded.
    """
    for bbl in targets:
        log.debug("bbl = %s" % bbl)
        try:
            parsed = process_target(pulldir, stype, bbl)
        except Exception as e:
            # One bad target must not stop the walk.
            log.info("FAIL %s = %s" % (bbl, e))
            log.exception(e)
            status['fail'] += [bbl]
        else:
            if not parsed:
                log.info("MISS %s" % bbl)
                status['miss'] += [bbl]
                continue
            log.info("GOOD %s" % bbl)
            status['good'] += [bbl]
            yield from expand(bbl, parsed)
def purge_target(pulldir, stype, bbl):
    """Delete the ``.txt`` file for ``bbl`` when an ``.html`` file exists beside it.

    Returns a ``Counter`` with one increment under:
      * 'kill'      - the ``.txt`` file was removed,
      * 'skip-txt'  - the ``.html`` file exists but there is no ``.txt`` file,
      * 'skip-html' - there is no ``.html`` file, so nothing was checked further.
    """
    tally = Counter()
    subdir = bbl2dirpath(bbl)
    stem = "%s/%s/%s-%d" % (pulldir, subdir, stype, bbl)

    checkpath = "%s.html" % stem
    if not os.path.exists(checkpath):
        log.info("SKIP checkpath = %s" % checkpath)
        tally['skip-html'] += 1
        return tally

    badpath = "%s.txt" % stem
    if not os.path.exists(badpath):
        log.info("SKIP badpath = %s" % badpath)
        tally['skip-txt'] += 1
        return tally

    os.remove(badpath)
    log.info("KILL badpath = %s" % badpath)
    tally['kill'] += 1
    return tally
def rescue_target(pulldir, stype, bbl):
    """Move files for ``bbl`` from the short directory layout to the current one.

    For each of the extensions pdf, txt and html, a file named
    ``<bbl>.<ext>`` under ``<pulldir>/<bbl2dirpath_short(bbl)>`` is renamed
    to ``<stype>-<bbl>.<ext>`` inside the directory returned by
    ``make_bbl_dir(pulldir, bbl)``.

    Returns a ``Counter`` with 'push' (files moved) and 'skip' (files not
    found) counts.
    """
    tally = Counter()
    subdir = bbl2dirpath_short(bbl)
    for ext in ('pdf', 'txt', 'html'):
        oldpath = "%s/%s/%d.%s" % (pulldir, subdir, bbl, ext)
        if not os.path.exists(oldpath):
            log.info("SKIP oldpath = %s" % oldpath)
            tally['skip'] += 1
            continue
        log.info("PUSH oldpath = %s" % oldpath)
        newdir = make_bbl_dir(pulldir, bbl)
        newpath = "%s/%s" % (newdir, "%s-%d.%s" % (stype, bbl, ext))
        log.info("DEST newpath = %s" % newpath)
        os.rename(oldpath, newpath)
        tally['push'] += 1
    return tally
def extract_unitcount(page):
    """Extract what appears to be the stabilized unit count from a page.

    Every string that ``yield_after`` reports as following the label
    'Housing-Rent Stabilization' is collected; those matching
    ``pat['integer']`` are converted to ``int``.

    Returns:
        ``None`` when no integer-looking value is found (including the case
        where values were found but none looked like an integer), the single
        value when there is exactly one, and the SUM of the values when there
        are several.  The multi-value case is logged as unusual.
    """
    rawvals = list(yield_after(page, 'Housing-Rent Stabilization'))
    log.info("RAW %s" % rawvals)
    counts = [int(raw) for raw in rawvals if re.match(pat['integer'], raw)]

    if not counts:
        if rawvals:
            # We got datefield entries but no actual unit counts.  Seems to
            # happen only on a small number of lots with past-due charges
            # that apparently no longer have stabilized units.
            log.info("WEIRD rawvals = %s but none are integer" % rawvals)
        # Either way, interpret as having no units.
        return None

    if len(counts) == 1:
        return counts[0]

    # Happens occasionally for lots with multiple buildings.
    log.info("WEIRD multiple unitcount values %s" % counts)
    return sum(counts)
# Optional flags that shape the target list.
parser.add_argument("--random", required=False, action="store_true", help="randomize targets")
parser.add_argument("--targets", required=False, type=str, help="explicit targets list")
# Exactly one mode flag must be given on the command line.
group = parser.add_mutually_exclusive_group(required=True)
group.add_argument("--check", action="store_true", help="check")
group.add_argument("--parse", action="store_true", help="parse")
group.add_argument("--rescue", action="store_true", help="rescue")
# The help text here used to read "rescue", a copy-paste from the flag above.
group.add_argument("--movepdfs", action="store_true", help="movepdfs")
# parser.add_argument("--loud", required=False, action="store_true", help="emit more data")
args = parser.parse_args()
log.info("args = %s" % args)
print("pid = %s" % os.getpid())


def init_targets(d, args):
    """Load the target list and narrow it down with ``refine``.

    The list is read from ``args.targets`` when given, otherwise from
    ``targets.txt`` under ``d['meta']``.  The resulting count is both printed
    and logged.
    """
    if args.targets:
        targetsfile = args.targets
    else:
        targetsfile = "%s/targets.txt" % d['meta']
    print("targets from '%s' .." % targetsfile)
    targets = list(read_ints(targetsfile))
    targets = refine(targets, args)
    print("that be %d targets." % len(targets))
    log.info("that be %d targets." % len(targets))
    return targets
def process(pulldir, stype, pubdate, bbl):
    """Fetch the document for one BBL and store it; return an outcome label.

    Returns one of:
      * 'skip'  - the ``.pdf`` output already exists,
      * 'fail'  - the search or grab request did not succeed,
      * 'good'  - a PDF came back and was saved,
      * 'miss'  - HTML came back instead; it is saved with an ``.html`` suffix,
      * 'error' - the Content-Type header was missing or unexpected.
    """
    agent = Agent()
    dirpath = make_bbl_dir(pulldir, bbl)
    pdf_path = "%s/%s-%s.pdf" % (dirpath, stype, bbl)
    if os.path.exists(pdf_path):
        log.info("SKIP %s" % bbl)
        return 'skip'

    log.info("search %s .." % bbl)
    found = agent.search(bbl)
    if found.status_code != 200:
        log.info("FAIL %s" % bbl)
        return 'fail'

    log.info("grab %s .." % bbl)
    reply = agent.grab(bbl, pubdate, stype)
    if reply is None:
        log.info("FAIL exception - %s" % bbl)
        return 'fail'

    # The content type is logged before the status check, even for failures.
    ctype = reply.headers.get('Content-Type')
    log.info("grab.content-type %s = %s" % (bbl, ctype))
    if reply.status_code != 200:
        log.info("FAIL %d - %s" % (reply.status_code, bbl))
        return 'fail'

    if ctype is not None:
        if ctype.startswith('application/pdf'):
            log.info("GOOD %s - pdf" % bbl)
            save_pdf(pdf_path, reply.content)
            return 'good'
        if ctype.startswith('text/html'):
            log.info("MISS %s - html" % bbl)
            html_path = "%s/%s-%s.html" % (dirpath, stype, bbl)
            ioany.save_lines(html_path, reply.text)
            return 'miss'

    # Either no Content-Type header at all, or a completely unexpected one.
    log.info("ERROR %s - bad content type '%s'" % (bbl, ctype))
    return 'error'
# Command-line options: where files are stashed and how the target list is
# selected and narrowed.
parser.add_argument("--stash", required=False, type=str, help="stash directory", default="stash")
parser.add_argument("--bounds", required=False, type=str, help="range tuple of the form start:limit")
parser.add_argument("--targets", required=False, type=str, help="target list")
parser.add_argument("--pace", required=False, type=str, help="tuple of the form N:k")
# Arguments are parsed at module level, so this runs as soon as the file is executed.
args = parser.parse_args()
log.info("args = %s" % args)
def save_pdf(path, content):
    """Write ``content`` to ``path`` in binary mode, replacing any existing file."""
    with open(path, "wb") as f:
        f.write(content)
def process(pulldir, stype, pubdate, bbl):
    """Return 'skip' if the PDF for ``bbl`` already exists; otherwise log and fall through.

    NOTE(review): as written, this definition ends after the "search" log
    line and so returns ``None`` implicitly; ``pubdate`` and ``agent`` are
    never used.  It looks like a shortened copy of a longer ``process`` --
    confirm which definition is meant to be live.
    """
    agent = Agent()
    dirpath = make_bbl_dir(pulldir, bbl)
    outfile = "%s/%s-%s.pdf" % (dirpath, stype, bbl)
    if os.path.exists(outfile):
        # Output already present; nothing to do for this BBL.
        log.info("SKIP %s" % bbl)
        return 'skip'
    log.info("search %s .." % bbl)
def post(self, url, **kwargs):
    """Issue a POST through ``self.s`` and return the response object.

    Extra keyword arguments are forwarded unchanged to ``self.s.post``.  The
    URL and response headers are logged at debug level, the status code at
    info level.
    """
    log.debug("url = %s" % url)
    response = self.s.post(url, **kwargs)
    log.info("POST r.status = %s" % response.status_code)
    log.debug("POST r.headers = %s" % response.headers)
    return response
def get(self, url, **kwargs):
    """Issue a GET through ``self.s`` and return the response object.

    Extra keyword arguments are forwarded unchanged to ``self.s.get``.  The
    URL and response headers are logged at debug level, the status code at
    info level.
    """
    log.debug("url = %s" % url)
    response = self.s.get(url, **kwargs)
    log.info("GET status = %s" % response.status_code)
    log.debug("GET r.headers = %s" % response.headers)
    return response