def full_scan():
    """Sync rent_control.geojson into Building entities.

    Reads every feature with a non-blank address, flags the matching
    Building as rent-controlled (creating it when absent), and saves all
    touched buildings in one batch put.
    """
    set_log(os.path.join("logs", "txt", "rent_control.txt"))
    log("Scanning rent_control.geojson", important=True)
    features = read(os.path.join("scrapers", "data", "rent_control.geojson"), isjson=True)["features"]
    # keep only rows that actually carry an address
    props = [feat["properties"] for feat in features if feat["properties"]["address"]]
    log("using %s of %s rows (omitting blank entries)"%(len(props), len(features)), 1)
    pending = []
    seen = set()
    for record in props:
        addr = record["address"]
        if addr in seen:
            continue  # duplicate address row; already handled
        bld = Building.query(Building.address == addr).get()
        if not bld:
            log("Can't find '%s' -- creating new entry"%(addr,), 2)
            bld = Building(address=addr)
        bld.rent_control = True
        pending.append(bld)
        seen.add(addr)
    log("saving %s rent-control buildings to db"%(len(pending),), 1)
    db.put_multi(pending)
    log("goodbye")
    close_log()
def propertyValues():
    """Aggregate per-zipcode property values for 2008-2013.

    Fills obj[year][zipcode]["property_values"]["all"] from the secured
    assessment-roll spreadsheets, then runs each ARR_ANYL conversion over
    the raw list and drops the raw "all" list afterward.
    """
    log("Analyzing Property Values", important=True)
    for year in range(2008, 2014):
        yobj = obj[year]
        tobj = yobj["total"]
        tobj["property_values"] = { "all": [] }
        log("parsing: %s"%(year,), 1)
        sheet = getxls(read(path("scrapers", "data", "pvals",
            "SanFranciscoPropertyAssessmentRoll%s.xlsx"%(year,))),
            "%sSecured"%(year,), parse=False)
        for rnum in range(1, sheet.nrows):  # row 0 is the header
            cells = sheet.row(rnum)
            pval = cells[7].value
            zipcode = cells[8].value[:5]
            if zipcode not in yobj:
                yobj[zipcode] = { "fires": 0, "people": 0, "units": 0 }
            zobj = yobj[zipcode]
            if "property_values" not in zobj:
                zobj["property_values"] = { "all": [] }
            zobj["property_values"]["all"].append(pval)
            tobj["property_values"]["all"].append(pval)
        for zcode, zobj in yobj.items():
            if "property_values" not in zobj:
                log("%s: missing!"%(zcode,), 2)
                continue
            raw = zobj["property_values"]["all"]
            ave = None
            log("analyzing: %s"%(zcode,), 2)
            for conversion, func in ARR_ANYL:
                zobj["property_values"][conversion] = func(raw, ave)
                ave = ave or zobj["property_values"][conversion] # average happens first
                log("%s: %s"%(conversion, zobj["property_values"][conversion]), 3)
            del zobj["property_values"]["all"]
def processjs(path, jspaths, inits):
    """Expand dynamic CT.require() imports within the script at *path*.

    Lines ending in ", true);" are deferred requires and are left alone.
    The path is appended to jspaths and the expanded block is returned
    with a trailing ";\\n".
    """
    block = read(path)
    lines = block.split("\n")
    for line in lines:
        is_require = line.startswith("CT.require(")
        if is_require and not line.endswith(", true);"):
            block = require(line, jspaths, block, inits)
    jspaths.append(path)
    return "%s;\n" % (block, )
def scan(etype):
    """Parse the 1997-2015 spreadsheet for *etype* into entity puts.

    Skips the header row and applies the per-type row parser from SCANROW.
    """
    fname = "%s_1997_2015.xlsx"%(etype,)
    log("Scanning %s"%(fname,), important=True)
    sheet = getxls(read(os.path.join("scrapers", "data", fname)))
    rowparser = SCANROW[etype]  # hoist the dispatch lookup out of the loop
    return [rowparser(row) for row in sheet[1:]]
def acquire(self, url, path):
    """Return the contents of *url*, using a file cache under *path*.

    The cache file is named after the last URL segment; on a miss the
    data is fetched and written through before being returned.
    """
    fname = url.rsplit("/", 1)[-1]
    fpath = os.path.join(path, fname)
    if os.path.exists(fpath):
        return read(fpath)  # cache hit
    log("acquiring: %s"%(url,))
    data = fetch(url)
    write(data, fpath)  # populate cache for next time
    return data
def parse(self):
    """Build self.timers from cron.yaml: each url line pairs with the
    schedule line that follows it.

    NOTE(review): the 7- and 12-char slice offsets assume fixed yaml
    prefixes ("url:" / "schedule:" lines) -- confirm against cron.yaml.
    """
    self.logger.info("parse")
    url = None
    for line in read("cron.yaml", True):
        if line.startswith(" url: "):
            url = line[7:].strip()
        elif url:
            # the line right after a url line is its schedule rule
            self.timers[url] = Rule(self.controller, url, line[12:].strip())
            url = None
def getWPmails():
    """Return the list of WordPress user emails.

    Reads pipe-delimited credentials (host|user|password|db) from the
    ".c" file and queries wp_users.

    FIX: the original never closed the MySQL connection or cursor;
    contextlib.closing now guarantees both are released even on error.
    """
    import MySQLdb
    from contextlib import closing
    h, u, p, d = read(".c").strip().split("|")
    log("extracting email list from WP", 1)
    with closing(MySQLdb.connect(host=h, user=u, passwd=p, db=d)) as conn:
        with closing(conn.cursor()) as cur:
            cur.execute("SELECT user_email FROM wp_users")
            rowz = cur.fetchall()
    log("found %s recipients"%(len(rowz),), 1)
    return [r[0] for r in rowz]
def setmode(mode):
    """Switch the project config to *mode* (dynamic/static/production).

    Updates the yaml backend config, then rewrites the config .py file's
    encode flag and mode marker in place.
    """
    doyaml(mode) # support other backends beyond app engine
    ctpy = read(config.py.path)
    isprod = mode == "production"
    # set encode: flip the encode flag to match production-ness
    old_enc = config.py.enc%(str(not isprod),)
    new_enc = config.py.enc%(str(isprod),)
    ctpy = ctpy.replace(old_enc, new_enc)
    # set mode: whichever other mode marker is present becomes ours
    for other in ["dynamic", "static", "production"]:
        if other == mode:
            continue
        ctpy = ctpy.replace(config.py.mode%(other,), config.py.mode%(mode,))
    write(ctpy, config.py.path)
def setmode(mode):
    """Set the active deployment mode, touching both yaml and py configs."""
    doyaml(mode) # support other backends beyond app engine
    ctpy = read(config.py.path)
    isprod = mode == "production"
    # set encode
    ctpy = ctpy.replace(
        config.py.enc % (str(not isprod), ),
        config.py.enc % (str(isprod), ))
    # set mode
    others = [m for m in ["dynamic", "static", "production"] if m != mode]
    for m in others:
        ctpy = ctpy.replace(config.py.mode % (m, ), config.py.mode % (mode, ))
    write(ctpy, config.py.path)
def snap():
    """Archive data.db under archive/, skipping if an identical copy exists.

    Returns the name of the matching archive on a duplicate, otherwise the
    timestamp-derived name of the new copy.
    """
    log("attempting snap", important=True)
    if not os.path.exists("archive"):
        log("creating archive directory")
        os.mkdir("archive")
    data = read("data.db", binary=True)
    match = indir(data, "archive")
    if match:
        # identical snapshot already archived -- nothing to do
        log("matching archive: %s - aborting snap" % (match, ))
        return match
    # name like "2024-01-02_12:34" -> "2024-01-02_12:34" (seconds dropped, space -> _)
    stamp = str(datetime.now()).rsplit(":", 1)[0]
    aname = stamp.replace(" ", "_")
    cmd('cp data.db "%s"' % (os.path.join("archive", aname), ))
    return aname
def allUnits():
    """Count units per address from the BlockLot CSV.

    Returns (counts-by-address, address-with-most-units, total-row-count).
    Rows whose three address columns are all blank are skipped.
    """
    rows = getcsv_from_data(read(path("scrapers", "data", "BlockLot_with_LatLon.csv")))
    buildings = {}
    winner = None
    for record in rows:
        addr = "%s %s %s"%(record[22], record[20], record[19])
        if not addr.strip():
            continue  # no usable address on this row
        buildings[addr] = buildings.get(addr, 0) + 1
        if not winner or buildings[addr] > buildings[winner]:
            winner = addr
    return buildings, winner, len(rows)
def doyaml(mode):
    """Rewrite the yaml config so only the *mode* section is uncommented.

    Sections are delimited by config.yaml.start / config.yaml.end marker
    lines; lines inside the selected section are un-commented, lines
    inside other sections are commented out, everything else is copied.

    FIX: the output file is now opened with a with-statement so it is
    closed even if read/write raises mid-rewrite (original leaked the
    handle on error).
    """
    log("switching to %s mode" % (mode, ))
    lines = read(config.yaml.path, lines=True)
    m = None  # name of the section we are currently inside, if any
    with open(config.yaml.path, 'w') as f:
        for line in lines:
            if line.startswith(config.yaml.start):
                m = line[len(config.yaml.start):].strip()
            elif line.startswith(config.yaml.end):
                m = None
            elif m == mode:
                line = line.strip("#")  # activate selected section
            elif m:
                line = "#%s" % (line.strip("#"), )  # deactivate other sections
            f.write(line)
def doyaml(mode):
    """Toggle yaml config sections so that only *mode*'s block is active."""
    log("switching to %s mode"%(mode,))
    source_lines = read(config.yaml.path, lines=True)
    out = open(config.yaml.path, 'w')
    section = None
    for line in source_lines:
        if line.startswith(config.yaml.start):
            # entering a named section
            section = line[len(config.yaml.start):].strip()
        elif line.startswith(config.yaml.end):
            # leaving the current section
            section = None
        elif section == mode:
            line = line.strip("#")
        elif section:
            line = "#%s"%(line.strip("#"),)
        out.write(line)
    out.close()
def pretex(doc, fname, fontonly=False):
    """Write the LaTeX preamble for *doc* to build/<fname>.tex and return its path.

    When fontonly is truthy AND a font family is configured, only the font
    declaration is written; otherwise tex/pre.tex is templated with the
    doc's logo, declaration page, signup sheet, name, revision and font.
    """
    fcfg = config.ctman.font
    pname = os.path.join("build", "%s.tex" % (fname, ))
    if not fontonly and doc.logo:
        iname = symage(doc.logo.path)
    fontdesc = fcfg.family and FONTFAM % (fcfg.family, fcfg.family) or ""
    # NOTE: "fontonly and fontdesc or ..." deliberately falls through to the
    # full preamble when fontdesc is empty, even in fontonly mode.
    body = fontonly and fontdesc or read("tex/pre.tex").replace(
        "_CLIENT_LOGO_", doc.logo and iname or "img/logo.jpg").replace(
        "_DECLARATION_PAGE_", doc.declaration_page and dex(doc) or "").replace(
        "_SIGNUP_SHEET_", doc.signup_sheet and SUSHEET or "").replace(
        "_DOC_NAME_", doc.name).replace(
        "_DOC_REVISION_", str(doc.revision)).replace(
        "_DOC_FONT_", fontdesc)
    write(body, pname)
    return pname
def build(nothing, dirname, fnames):
    """
    This parses an html file, squishes together the javascript, scans
    through for dynamic imports (CT.require statements), injects modules
    wherever necessary, and sticks the result in a big <script> tag.

    FIX: mode comparisons used "is" against string literals, which tests
    object identity and only worked via CPython string interning (and is a
    SyntaxWarning on modern Python); they now use "==".
    """
    if ".svn" in dirname:
        return
    for bd in config.build.compiled_dirs.values():
        checkdir(bd)
    for fname in bfiles(dirname, fnames):
        for mode, compdir in config.build.compiled_dirs.items():
            fulldir = dirname.replace(config.build.dynamic_dir, compdir)
            frompath = os.path.join(dirname, fname)
            topath = os.path.join(fulldir, fname)
            data = read(frompath)
            log('building: %s -> %s' % (frompath, topath), important=True)
            checkdir(fulldir)
            if "fonts" in dirname or not fname.endswith(".html"):
                # fonts and non-html assets pass through untouched
                log('copying non-html file', 1)
            else:
                txt, js = processhtml(data)
                if js:
                    jspaths, jsblock = compilejs(js)
                    if mode == "static":
                        log("static mode", 1)
                        # reference each script by tag instead of inlining
                        js = '\n'.join([
                            p.endswith("js") and '<script src="%s"></script>' % (p, )
                            or '<script>%s</script>' % (p, ) for p in jspaths])
                    elif mode == "production":
                        log("production mode", 1)
                        txt = compress(txt)
                        from slimit import minify
                        # flip encode/log flags on for production before minifying
                        js = "<script>%s</script>" % (minify(jsblock.replace(
                            '"_encode": false,', '"_encode": true,').replace(
                            "CT.log._silent = false;", "CT.log._silent = true;"),
                            mangle=True), )
                    else:
                        error("invalid mode: %s" % (mode, ))
                    data = txt.format(jsspot=js)
                else:
                    data = txt
            write(data, topath)
def getRentControl():
    """Return the set of rent-controlled addresses, lazily populating it.

    On first call (empty rcset) this parses rent_control.geojson, resets
    every zipcode's rent_control tally in zmap, and counts controlled
    units per zipcode (plus a city-wide "SF" total).

    FIX: emptiness was tested with "not len(list(rcset))", which copied
    the whole set into a throwaway list; a set is falsy when empty, so
    "not rcset" is equivalent and allocation-free.
    """
    if not rcset:
        import os
        from cantools.util import read, log
        log("Parsing Rent Control Data", important=True)
        rcd = read(os.path.join("scrapers", "data", "rent_control.geojson"), isjson=True)["features"]
        for z in zmap:
            zmap[z]["rent_control"] = 0
        rcp = [rc["properties"] for rc in rcd if rc["properties"]["address"]]
        log("using %s of %s rows (omitting blank entries)"%(len(rcp), len(rcd)), 1)
        for d in rcp:
            zc = d["zipcode"]
            rcset.add(d["address"])
            if not zc in zmap:
                # zipcode not seen before -- track it ad hoc
                zmap[zc] = { "rent_control": 0, "ad_hoc": True }
            zmap[zc]["rent_control"] += 1
            zmap["SF"]["rent_control"] += 1
    return rcset
def __init__(self):
    """Load the zipcode cache from disk and zero every entry's hit count."""
    self.cache = read("%s.json"%(zcpath,), isjson=True, default={})
    for addr in self.cache:
        # reset per-run usage counters
        self.cache[addr]["count"] = 0
def part(fname):
    """Return the template file prefixed with a markdown header from its stem."""
    title = fname.split(".")[0]
    body = read(os.path.join("templates", fname))
    return "# %s\n%s" % (title, body)
def __init__(self):
    """Initialize the address cache from <zcpath>.json, clearing counts."""
    self.cache = read("%s.json" % (zcpath, ), isjson=True, default={})
    for entry in self.cache.values():
        entry["count"] = 0  # fresh run, no lookups yet
38: [["\n\n 1\n\n 5\n\n", "\n\n"]], 43: [["\n\n 1\n\n 1\n\n", "\n\n"]], 45: [["\n\n 1\n\n", "\n\n"]], 47: [["\n\n 5\n\n 1\n\n", "\n\n"]], 51: [["\n\n 1\n\n", "\n\n"]], 23: [["\nB02\nB10\nB10\nB09\nB06\nB06\nB03\nB06\nB01\nB09\nB06\nB08\nB06\nB10\nB06\nB01\nB01\nB10\nB04\nB03\nB04\nB06\nB02\nB10\nB06\nB08\nB04\nB07\nB02\nB10\nB09\nB08\nB10\nB02\nB04\nB06\nB04\nB05\nB10\nB02\nB09\n\n94124\n94124\n94112\n94114\n94110\n94130\n94110\n94133\n94112\n94110\n94122\n94131\n\n94114\n94111\n94102\n94124\n94123\n94104\n94109\n94110\n94114\n94107\n\n94116\n94115\n94121\n94110\n94124\n94112\n94122\n94124\n94103\n94115\n94131\n94109\n94117\n94124\n94102\n94112\n", "\n94124\n94124\n94112\n94114\n94110\n94130\n94110\n94133\n94112\n94110\n94122\n94131\n\n94114\n94111\n94102\n94124\n94123\n94104\n94109\n94110\n94114\n94107\n\n94116\n94115\n94121\n94110\n94124\n94112\n94122\n94124\n94103\n94115\n94131\n94109\n94117\n94124\n94102\n94112\n\nB02\nB10\nB10\nB09\nB06\nB06\nB03\nB06\nB01\nB09\nB06\nB08\nB06\nB10\nB06\nB01\nB01\nB10\nB04\nB03\nB04\nB06\nB02\nB10\nB06\nB08\nB04\nB07\nB02\nB10\nB09\nB08\nB10\nB02\nB04\nB06\nB04\nB05\nB10\nB02\nB09\n"]], 32: [['\n1267 PAGE ST\n333 MONTICELLO ST\n1568 HAIGHT ST\n\n94117\n94132\n94117\n', '\n\n1267 PAGE ST\n94117\n333 MONTICELLO ST\n94132\n1568 HAIGHT ST\n94117\n'], ["\n\n 2\n\n 2\n\n 1\n\n", "\n\n"]] } STEST = False config.geo.test = STEST # happens automatically based on ct.cfg unless scraper is run independent of web server pbase = STEST and ".." or "." 
def path(*components):
    """Join *components* under the scraper base directory (pbase)."""
    return os.path.join(pbase, *components)

# fires.txt is a form-feed-separated dump; keep only pages containing PBRK,
# taking the text after the first break marker on each page.
data = read(path("scrapers", "data", "fires.txt"))
pages = [pg.split(PBRK)[1] for pg in data.split("\x0c") if PBRK in pg]

def getDates(page):
    """Split leading date lines off *page*.

    Returns (list of datetimes parsed from M/D/Y tokens, remaining page text).
    """
    datelines, page = page.split("\n\n", 1)
    dates = []
    for dateline in datelines.split("\n"):
        if "/" not in dateline:
            continue
        month, day, year = [int(d) for d in dateline.split(" ")[-1].split("/")]
        dates.append(datetime(year, month, day))
    log("got %s dates"%(len(dates),), 1)
    return dates, page

def a2z(addr):
    """Resolve a street address to a zipcode, assuming San Francisco, CA."""
    return addr2zip("%s, san francisco, CA"%(addr,))