Beispiel #1
0
def perform(dirname, stype, convert=None, limit=None):
    """Run `convert` over the .pdf files found under `dirname`.

    For every (dirpath, fname) pair produced by `walkdir` (at most `limit`
    of them; None means no cap), the output path `<dirpath>/<stype>-<bbl>.txt`
    is derived.  Files whose output already exists are skipped; otherwise
    `convert(infile, outfile)` is called and any exception it raises is
    logged rather than propagated.

    Returns a defaultdict(list) with the BBLs collected under the keys
    'seen', 'skip', 'good' and 'fail'.

    Raises ValueError if a file name carries a different stype than the
    one requested.
    """
    log.info("limit = %s" % limit)
    log.info("dirname = %s" % dirname)
    status = defaultdict(list)
    candidates = islice(walkdir(dirname, stype, ['pdf']), limit)
    for dirpath, fname in candidates:
        found_stype, bbl, _ext = split_fname(fname)
        if found_stype != stype:
            raise ValueError("stype mismatch")
        log.debug("stype,bbl = %s,%d .." % (found_stype, bbl))
        status['seen'].append(bbl)
        infile = "%s/%s" % (dirpath, fname)
        outfile = "%s/%s-%d.txt" % (dirpath, stype, bbl)
        # An existing output file means this BBL was handled on a prior run.
        if os.path.exists(outfile):
            log.info("SKIP %d" % bbl)
            status['skip'].append(bbl)
            continue
        try:
            convert(infile, outfile)
            log.info("GOOD %d" % bbl)
            status['good'].append(bbl)
        except Exception as e:
            # Best effort: record the failure and carry on with the next file.
            log.info("FAIL %d %s" % (bbl, e))
            log.exception(e)
            status['fail'].append(bbl)
    return status
Beispiel #2
0
def process_target(pulldir, stype, bbl):
    """Parse the .pdf stored for `bbl`, if there is one.

    The file is looked up at `<pulldir>/<bbl2dirpath(bbl)>/<stype>-<bbl>.pdf`.
    Returns whatever `parse` returns for the opened (binary) file, or None
    when the file does not exist.
    """
    subdir = bbl2dirpath(bbl)
    fname = "%s-%d.pdf" % (stype, bbl)
    infile = "%s/%s/%s" % (pulldir, subdir, fname)
    log.info("infile = %s" % infile)
    if not os.path.exists(infile):
        return None
    with open(infile, "rb") as handle:
        return parse(handle)
Beispiel #3
0
def inventory(dirname, stype):
    """Take a quick inventory of which BBLs have pdf/txt/html files.

    Returns a defaultdict(set) mapping each file extension encountered to
    the set of BBLs that have a file with that extension.
    """
    log.info('..')
    seen = defaultdict(set)
    for _dirpath, fname in walkdir(dirname, stype, ('pdf', 'txt', 'html')):
        # Only the BBL and the extension matter here; the stype parsed from
        # the file name is ignored.
        _stype, bbl, ext = split_fname(fname)
        seen[ext].add(bbl)
    return seen
Beispiel #4
0
def init_targets(d, args):
    """Load the list of targets and narrow it according to `args`.

    The targets are read from `args.targets` when it is given, otherwise
    from `targets.txt` inside the directory named by `d['meta']`.  The raw
    list is passed through `refine` before being returned.
    """
    targetsfile = args.targets if args.targets else "%s/targets.txt" % d['meta']
    print("targets from '%s' .." % targetsfile)
    raw = list(read_ints(targetsfile))
    targets = refine(raw, args)
    summary = "that be %d targets." % len(targets)
    print(summary)
    log.info(summary)
    return targets
Beispiel #5
0
def rename_target(pulldir, stype, bbl):
    """Rename the `.pdf` file stored for `bbl` to `.html`, if it exists.

    Returns a Counter with a single entry: 'move' when the file was
    renamed, 'skip' when there was no `.pdf` to rename.
    """
    tally = Counter()
    stem = "%s/%s/%s-%d" % (pulldir, bbl2dirpath(bbl), stype, bbl)
    oldpath = "%s.pdf" % stem
    newpath = "%s.html" % stem
    if not os.path.exists(oldpath):
        log.info("SKIP oldpath = %s" % oldpath)
        tally['skip'] += 1
        return tally
    log.info("MOVE oldpath = %s" % oldpath)
    os.rename(oldpath, newpath)
    tally['move'] += 1
    return tally
Beispiel #6
0
def run(spec, targets):
    """Dispatch `targets` for `spec`, then log the elapsed time and a tally.

    `dispatch` is expected to return a mapping whose values are sized
    collections; the tally logged is the length of each of them.

    NOTE(review): the pull directory still comes from the module-level
    `d['pull']`, which is not visible in this snippet.
    """
    log.info("..")
    t0 = time.time()
    # Honour the `spec` argument instead of reaching for the global
    # `args.spec`, which silently ignored what the caller passed in.
    x = dispatch(d['pull'], spec, targets)
    delta = time.time() - t0
    tally = {k: len(v) for k, v in x.items()}
    log.info("done in %.3f sec" % delta)
    log.info("tally = %s" % tally)
Beispiel #7
0
def refine(targets, args):
    """Return `targets` narrowed by the optional `bounds` and `pace` flags.

    `args.bounds`, when set, is parsed with `parsetup` into an (i, j) pair
    and applied as the slice `targets[i:j]`.  `args.pace`, when set, is
    parsed the same way and its items are passed on to `modslice`.  The
    bounds restriction is applied before the pace restriction.
    """
    bounds = None
    if args.bounds:
        bounds = parsetup(args.bounds)
    if bounds:
        start, stop = bounds
        log.info("that be %d raw targets; restricting by range .." %
                 len(targets))
        targets = targets[start:stop]

    pace = None
    if args.pace:
        pace = parsetup(args.pace)
    if pace:
        log.info("that be %d raw targets; restricting by pace .." %
                 len(targets))
        targets = list(modslice(targets, *pace))

    log.info("that be %d targets" % len(targets))
    return targets
Beispiel #8
0
 def _walkdir():
     """Yield the expanded records of every target that parses successfully.

     Reads `targets`, `pulldir`, `stype` and `status` from the enclosing
     scope.  Each BBL is recorded in `status` under 'fail' (processing
     raised), 'miss' (nothing came back) or 'good'; only 'good' targets
     contribute items, via `expand(bbl, d)`.
     """
     for bbl in targets:
         log.debug("bbl = %s" % bbl)
         try:
             d = process_target(pulldir, stype, bbl)
         except Exception as e:
             # Best effort: note the failure and move on to the next target.
             log.info("FAIL %s = %s" % (bbl, e))
             log.exception(e)
             status['fail'] += [bbl]
             continue
         if not d:
             log.info("MISS %s" % bbl)
             status['miss'] += [bbl]
             continue
         log.info("GOOD %s" % bbl)
         status['good'] += [bbl]
         yield from expand(bbl, d)
Beispiel #9
0
def purge_target(pulldir, stype, bbl):
    """Delete the `.txt` file of `bbl` when an `.html` file sits next to it.

    Returns a Counter with a single entry:
      'skip-html' - there is no `.html` file, so nothing was checked further
      'skip-txt'  - the `.html` exists but there is no `.txt` to delete
      'kill'      - the `.txt` file was removed
    """
    tally = Counter()
    stem = "%s/%s/%s-%d" % (pulldir, bbl2dirpath(bbl), stype, bbl)

    checkpath = "%s.html" % stem
    if not os.path.exists(checkpath):
        log.info("SKIP checkpath = %s" % checkpath)
        tally['skip-html'] += 1
        return tally

    badpath = "%s.txt" % stem
    if not os.path.exists(badpath):
        log.info("SKIP badpath = %s" % badpath)
        tally['skip-txt'] += 1
        return tally

    os.remove(badpath)
    log.info("KILL badpath = %s" % badpath)
    tally['kill'] += 1
    return tally
Beispiel #10
0
def rescue_target(pulldir, stype, bbl):
    """Move the files of `bbl` from the short layout into its BBL directory.

    For each of the extensions pdf, txt and html, the file
    `<pulldir>/<bbl2dirpath_short(bbl)>/<bbl>.<ext>` is renamed to
    `<stype>-<bbl>.<ext>` inside the directory returned by `make_bbl_dir`.

    Returns a Counter of 'push' (files moved) and 'skip' (files not found).
    """
    tally = Counter()
    subdir = bbl2dirpath_short(bbl)
    for ext in ('pdf', 'txt', 'html'):
        oldpath = "%s/%s/%d.%s" % (pulldir, subdir, bbl, ext)
        if not os.path.exists(oldpath):
            log.info("SKIP oldpath = %s" % oldpath)
            tally['skip'] += 1
            continue
        log.info("PUSH oldpath = %s" % oldpath)
        # The destination directory is only requested once a file is found.
        newdir = make_bbl_dir(pulldir, bbl)
        newpath = "%s/%s" % (newdir, "%s-%d.%s" % (stype, bbl, ext))
        log.info("DEST newpath = %s" % newpath)
        os.rename(oldpath, newpath)
        tally['push'] += 1
    return tally
Beispiel #11
0
def extract_unitcount(page):
    """Extract what appears to be the stabilized unit count from a page.

    The working hypothesis: any string that `yield_after` reports for the
    label 'Housing-Rent Stabilization' and that matches `pat['integer']`
    is a unit count.

    Returns:
      None       - no integer-looking value was found
      the value  - exactly one integer-looking value was found
      their sum  - several were found (this is also noted in the log)

    NOTE(review): several matches are summed, on the stated assumption that
    they belong to separate buildings on the same lot - confirm that a page
    never repeats the same count more than once.
    """
    rawvals = list(yield_after(page, 'Housing-Rent Stabilization'))
    log.info("RAW %s" % rawvals)
    counts = [int(raw) for raw in rawvals if re.match(pat['integer'], raw)]

    if not counts:
        if rawvals:
            # There were entries after the label, but none of them is a unit
            # count.  Seems to happen only on a small number of lots with
            # past-due charges but apparently no stabilized units any more.
            log.info("WEIRD rawvals = %s but none are integer" % rawvals)
        # Either way, interpret as having no units.
        return None

    if len(counts) == 1:
        return counts[0]

    # Happens occasionally for lots with multiple buildings.
    log.info("WEIRD multiple unitcount values %s" % counts)
    return sum(counts)
Beispiel #12
0
# Optional flags controlling which targets are processed.
parser.add_argument("--random",
                    required=False,
                    action="store_true",
                    help="randomize targets")
parser.add_argument("--targets",
                    required=False,
                    type=str,
                    help="explicit targets list")
# Exactly one mode flag must be given on the command line.
group = parser.add_mutually_exclusive_group(required=True)
group.add_argument("--check", action="store_true", help="check")
group.add_argument("--parse", action="store_true", help="parse")
group.add_argument("--rescue", action="store_true", help="rescue")
# The help text used to read "rescue", copy-pasted from the flag above.
group.add_argument("--movepdfs", action="store_true", help="move pdfs")
# parser.add_argument("--loud", required=False, action="store_true", help="emit more data")
args = parser.parse_args()
log.info("args = %s" % args)
print("pid = %s" % os.getpid())


def init_targets(d, args):
    """Read the targets file, refine the list per `args`, and return it.

    `args.targets` names the file when given; otherwise the file is
    `targets.txt` under the directory in `d['meta']`.  The final count is
    both printed and logged.
    """
    if args.targets:
        source = args.targets
    else:
        source = "%s/targets.txt" % d['meta']
    print("targets from '%s' .." % source)
    targets = refine(list(read_ints(source)), args)
    count = len(targets)
    print("that be %d targets." % count)
    log.info("that be %d targets." % count)
    return targets
Beispiel #13
0
def process(pulldir, stype, pubdate, bbl):
    """Fetch the document for `bbl` and store it in its BBL directory.

    Returns one of the status strings:
      'skip'  - the .pdf already exists on disk, nothing was requested
      'fail'  - the search or grab request did not succeed
      'good'  - a PDF response was saved as `<stype>-<bbl>.pdf`
      'miss'  - an HTML response was saved as `<stype>-<bbl>.html`
      'error' - the response had a missing or unexpected content type
    """
    agent = Agent()
    dirpath = make_bbl_dir(pulldir, bbl)
    outfile = "%s/%s-%s.pdf" % (dirpath, stype, bbl)
    if os.path.exists(outfile):
        log.info("SKIP %s" % bbl)
        return 'skip'

    log.info("search %s .." % bbl)
    r = agent.search(bbl)
    if r.status_code != 200:
        log.info("FAIL %s" % bbl)
        return 'fail'

    log.info("grab %s .." % bbl)
    r = agent.grab(bbl, pubdate, stype)
    if r is None:
        log.info("FAIL exception - %s" % bbl)
        return 'fail'

    ctype = r.headers.get('Content-Type')
    log.info("grab.content-type %s = %s" % (bbl, ctype))
    if r.status_code != 200:
        log.info("FAIL %d - %s" % (r.status_code, bbl))
        return 'fail'

    if ctype is not None:
        if ctype.startswith('application/pdf'):
            log.info("GOOD %s - pdf" % bbl)
            save_pdf(outfile, r.content)
            return 'good'
        if ctype.startswith('text/html'):
            log.info("MISS %s - html" % bbl)
            outfile = "%s/%s-%s.html" % (dirpath, stype, bbl)
            ioany.save_lines(outfile, r.text)
            return 'miss'

    # Either no Content-Type header at all, or a completely unexpected one.
    log.info("ERROR %s - bad content type '%s'" % (bbl, ctype))
    return 'error'
Beispiel #14
0
# Directory used as the stash; falls back to "stash" when not given.
parser.add_argument("--stash", type=str, default="stash", required=False,
                    help="stash directory")
# Optional restrictions on which targets get processed.
parser.add_argument("--bounds", type=str, required=False,
                    help="range tuple of the form start:limit")
parser.add_argument("--targets", type=str, required=False,
                    help="target list")
parser.add_argument("--pace", type=str, required=False,
                    help="tuple of the form N:k")
args = parser.parse_args()
log.info("args = %s" % args)


def save_pdf(path, content):
    """Write the bytes `content` to `path`, replacing any existing file."""
    with open(path, mode="wb") as sink:
        sink.write(content)


def process(pulldir, stype, pubdate, bbl):
    """Start fetching the `stype` document for `bbl`.

    Returns 'skip' when `<stype>-<bbl>.pdf` already exists in the directory
    returned by `make_bbl_dir`.

    NOTE(review): this copy of the function looks truncated - it ends right
    after logging the search, so every non-skip call falls through and
    returns None.  Confirm against the complete definition.
    """
    agent = Agent()
    dirpath = make_bbl_dir(pulldir, bbl)
    outfile = "%s/%s-%s.pdf" % (dirpath, stype, bbl)
    # An existing .pdf means this BBL has already been pulled.
    if os.path.exists(outfile):
        log.info("SKIP %s" % bbl)
        return 'skip'
    log.info("search %s .." % bbl)
Beispiel #15
0
 def post(self, url, **kwargs):
     """Issue a POST through `self.s`, logging the status and headers.

     Extra keyword arguments are forwarded unchanged; the response object
     is returned as-is.
     """
     log.debug("url = %s" % url)
     response = self.s.post(url, **kwargs)
     log.info("POST r.status = %s" % response.status_code)
     log.debug("POST r.headers = %s" % response.headers)
     return response
Beispiel #16
0
 def get(self, url, **kwargs):
     """Issue a GET through `self.s`, logging the status and headers.

     Extra keyword arguments are forwarded unchanged; the response object
     is returned as-is.
     """
     log.debug("url = %s" % url)
     response = self.s.get(url, **kwargs)
     log.info("GET status = %s" % response.status_code)
     log.debug("GET r.headers = %s" % response.headers)
     return response