Beispiel #1
0
    def cleanUS(self, tbl, dbBase=None, locVar=None, reset=False):
        """Fill ``lat``, ``lng``, ``State`` and ``CD`` on ``tbl`` from a geo lookup database.

        Steps, in order:

        1. Attach ``dbBase`` and index the location columns.
        2. If ``tbl`` has no ``lat1`` column yet, or ``reset`` is true, call
           ``s.merge`` three times and then blank ``lat``/``lng``/``CD``:
           * ``congdistZip`` joined on the zipcode column -> ``lat1``,
             ``lng1``, ``CD1``, ``State1``;
           * ``USCities`` joined on city + state -> ``<zipcode>2``;
           * ``congdistZip`` joined on ``<zipcode>2`` -> ``lat2``, ``lng2``,
             ``CD2``, ``State2``.
        3. If any row has an empty or NULL ``lat``, run three update passes
           (see the inline comments) that copy the "2" or "1" values into
           ``lat``, ``lng``, ``State`` and ``CD``.

        Args:
            tbl: Name of the table to update. It is interpolated directly
                into SQL, so it must be a trusted identifier.
            dbBase: Path of the lookup database to attach. ``None`` selects
                "/home/ron/disambig/geo/CD_ZIP.sqlite3".
            locVar: Three column names, in the order city, state, zipcode.
                ``None`` selects ``["City", "State", "Zipcode"]``.
            reset: When true, redo the merges and blank ``lat``/``lng``/``CD``
                even if ``lat1`` already exists.
        """
        # Identity test is the correct way to detect "argument not supplied".
        if dbBase is None:
            dbBase = "/home/ron/disambig/geo/CD_ZIP.sqlite3"
        # A fresh list per call avoids sharing one mutable default object.
        if locVar is None:
            locVar = ["City", "State", "Zipcode"]

        # NOTE(review): the handle is created for self.tbl while the raw SQL
        # below targets the ``tbl`` argument -- confirm callers always pass
        # the same table name.
        s = SQLite(db=self.db, tbl=self.tbl)
        s.attach(dbBase)

        locStr = ", ".join(locVar)
        locQStr = "=? AND ".join(locVar) + "=?"
        s.index(locVar)

        if 'lat1' not in s.columns(output=False) or reset:
            # "1" columns: looked up through the record's own zipcode.
            s.merge(key=[['lat1', 'latitude'], ['lng1', 'longitude'], ['CD1', 'CD'], ['State1', 'State']],
                    on=[[locVar[2], 'Zipcode']], tableFrom='congdistZip', db='db')
            # Second zipcode, looked up through city + state.
            s.merge(key=[[locVar[2] + "2", 'Zipcode']],
                    on=[[locVar[0], 'City'], [locVar[1], 'State']], tableFrom='USCities', db='db')
            # "2" columns: looked up through that second zipcode.
            s.merge(key=[['lat2', 'latitude'], ['lng2', 'longitude'], ['CD2', 'CD'], ['State2', 'State']],
                    on=[[locVar[2] + '2', 'Zipcode']], tableFrom='congdistZip', db='db')

            s.add('lat', '')
            s.add('lng', '')
            s.add('CD', '')

            s.c.execute("UPDATE %s SET lat='', lng='', CD=''" % tbl)

        ##    Original author's notes:
        ##    HERE ARE MY ASSUMPTIONS TO PUT LNG/LAT INTO LOU'S GRANT FILE -- 944,549 total records
        ##      1. City, State match is more precise than Zipcode match (sometimes Zip is just wrong..) Use that as default -- (206,369) 21.8%
        ##      2. If City, State match doesn't happen, then I default to Zipcode match ... small (3,998) 0.4%
        ##      3. If CD, State match -- use Zipcode centroid.  (693,922) 73.5%
        ##      4. 1-3 not capturing anything BUT city is filled (I did a quick scan, these all basically look foreign, see CSV) (7,217) 0.7%
        ##      5. Organization labeled as "UNKNOWN" (without City, State) - (30,640) 3.3%
        ##      6. Blanks
        ##         a) create frequency table of Standized_Organization with Zipcodes.
        ##         b) check if organization exists in database.  If so, align it with most frequent Zipcode combo (1,356) 0.1%
        ##      7. Blank (non 6) (1007) 0.1%
        ##      8. UNKNOWN, Blank or "Foreign" -- Delete for now, although I have the CSV output saved as blankCode.csv (38,864)
        ##      9. Remaining records: (905,685) 95.8%

        if s.c.execute("SELECT count(*) FROM %s WHERE lat='' or lat is null" % tbl).fetchone()[0] > 0:
            update_sql = "UPDATE %s SET lat=?, lng=?, State=?, CD=? WHERE %s" % (tbl, locQStr)
            # Each pass selects one row per location group and writes its
            # values back to every row of that group. Order matters: a later
            # pass overwrites an earlier one, and pass 3 compares against the
            # State column that passes 1 and 2 have just written.
            # The selected columns are not aggregated, so which row of a group
            # supplies the values (and is tested by HAVING) is up to SQLite.
            passes = (
                # 1. Default everything to the city/state-derived "2" values.
                ("lat2, lng2, State2, CD2", ""),
                # 2. Where CD2 is blank/NULL, fall back to the zipcode-derived "1" values.
                ("lat1, lng1, State1, CD1", " HAVING CD2='' or CD2 is null"),
                # 3. Where State1 equals the current State, take the "1" values.
                ("lat1, lng1, State1, CD1", " HAVING State1=State"),
            )
            for columns, having in passes:
                rows = s.c.execute("SELECT %s, %s FROM %s GROUP BY %s%s" %
                                   (columns, locStr, tbl, locStr, having)).fetchall()
                if rows:
                    s.c.executemany(update_sql, rows)

        s.close()
Beispiel #2
0
 def setKey(self, db, table="main"):
     """Write the grouped key for every row of ``table`` in database ``db``.

     Reads (``self.fld``, ``<self.uqKey>2``) pairs from the ``grp`` table of
     ``self.s`` into a dictionary, then sets the ``self.uqKey`` column of
     ``table`` to the dictionary entry for that row's ``self.fld`` value.
     Rows whose value has no entry receive an empty string.

     Args:
         db: Passed to ``SQLite`` to open the database that gets updated.
         table: Name of the table to update; interpolated directly into SQL.
     """
     source = self.s
     source.open()
     pairs = source.c.execute("SELECT %s, %s2 FROM grp" % (self.fld, self.uqKey)).fetchall()
     lookup = dict(pairs)
     source.close()

     target = SQLite(db)
     # Register the dictionary lookup as a one-argument SQL function so the
     # whole table can be rewritten with a single UPDATE statement.
     target.conn.create_function("OrgDct", 1, lambda value: lookup.get(value, ""))
     target.c.execute("UPDATE %s SET %s=OrgDct(%s)" % (table, self.uqKey, self.fld))
     target.conn.commit()
     target.close()
Beispiel #3
0
    def merge(self, keys, db=None, tbl="main"):
        """Collapse several unique keys onto the smallest key among them.

        Every row of ``grp`` (in ``self.s``) whose ``<self.uqKey>2`` column
        holds one of ``keys`` is rewritten to hold the minimum key. When
        ``db`` is given, the ``self.uqKey`` column of ``tbl`` in that database
        is rewritten the same way.

        Args:
            keys: Sequence of key strings. If the first key is shorter than
                13 characters, every key is treated as one leading character
                followed by an integer and is expanded to that character plus
                the integer zero-padded to 12 digits. An empty sequence is a
                no-op.
            db: Passed to ``SQLite`` to open a second database to update;
                ``None`` skips that step.
            tbl: Table name in ``db``; interpolated directly into SQL, so it
                must be a trusted identifier.

        Raises:
            ValueError: If a short-form key's tail is not an integer.
        """
        # Nothing to merge; also avoids IndexError on keys[0] and
        # ValueError from min() on an empty sequence.
        if len(keys) == 0:
            return

        # Normalise before opening anything so a malformed key cannot leave
        # the handle open.
        if len(keys[0]) < 13:
            keys = ["%s%0.12d" % (x[0], int(x[1:])) for x in keys]
        k1 = min(keys)

        # Column and table names cannot be bound, so they are still
        # interpolated; the key values are bound as parameters so that keys
        # containing quote characters are handled correctly.
        s = self.s
        s.open()
        grp_sql = "UPDATE grp SET %s2=? WHERE %s2=?" % (self.uqKey, self.uqKey)
        for k in keys:
            s.c.execute(grp_sql, (k1, k))
        s.conn.commit()
        s.close()

        if db is not None:
            t = SQLite(db)
            tbl_sql = "UPDATE %s SET %s=? WHERE %s=?" % (tbl, self.uqKey, self.uqKey)
            for k in keys:
                t.c.execute(tbl_sql, (k1, k))
            t.conn.commit()
            t.close()
Beispiel #4
0
    def cleanUS(self,
                tbl,
                dbBase=None,
                locVar=None,
                reset=False):
        """Fill ``lat``, ``lng``, ``State`` and ``CD`` on ``tbl`` from a geo lookup database.

        Steps, in order:

        1. Attach ``dbBase`` and index the location columns.
        2. If ``tbl`` has no ``lat1`` column yet, or ``reset`` is true, call
           ``s.merge`` three times and then blank ``lat``/``lng``/``CD``:
           * ``congdistZip`` joined on the zipcode column -> ``lat1``,
             ``lng1``, ``CD1``, ``State1``;
           * ``USCities`` joined on city + state -> ``<zipcode>2``;
           * ``congdistZip`` joined on ``<zipcode>2`` -> ``lat2``, ``lng2``,
             ``CD2``, ``State2``.
        3. If any row has an empty or NULL ``lat``, run three update passes
           (see the inline comments) that copy the "2" or "1" values into
           ``lat``, ``lng``, ``State`` and ``CD``.

        Args:
            tbl: Name of the table to update. It is interpolated directly
                into SQL, so it must be a trusted identifier.
            dbBase: Path of the lookup database to attach. ``None`` selects
                "/home/ron/disambig/geo/CD_ZIP.sqlite3".
            locVar: Three column names, in the order city, state, zipcode.
                ``None`` selects ``["City", "State", "Zipcode"]``.
            reset: When true, redo the merges and blank ``lat``/``lng``/``CD``
                even if ``lat1`` already exists.
        """
        # Identity test is the correct way to detect "argument not supplied".
        if dbBase is None:
            dbBase = "/home/ron/disambig/geo/CD_ZIP.sqlite3"
        # A fresh list per call avoids sharing one mutable default object.
        if locVar is None:
            locVar = ["City", "State", "Zipcode"]

        # NOTE(review): the handle is created for self.tbl while the raw SQL
        # below targets the ``tbl`` argument -- confirm callers always pass
        # the same table name.
        s = SQLite(db=self.db, tbl=self.tbl)
        s.attach(dbBase)

        locStr = ", ".join(locVar)
        locQStr = "=? AND ".join(locVar) + "=?"
        s.index(locVar)

        if 'lat1' not in s.columns(output=False) or reset:
            # "1" columns: looked up through the record's own zipcode.
            s.merge(key=[['lat1', 'latitude'], ['lng1', 'longitude'],
                         ['CD1', 'CD'], ['State1', 'State']],
                    on=[[locVar[2], 'Zipcode']],
                    tableFrom='congdistZip',
                    db='db')
            # Second zipcode, looked up through city + state.
            s.merge(key=[[locVar[2] + "2", 'Zipcode']],
                    on=[[locVar[0], 'City'], [locVar[1], 'State']],
                    tableFrom='USCities',
                    db='db')
            # "2" columns: looked up through that second zipcode.
            s.merge(key=[['lat2', 'latitude'], ['lng2', 'longitude'],
                         ['CD2', 'CD'], ['State2', 'State']],
                    on=[[locVar[2] + '2', 'Zipcode']],
                    tableFrom='congdistZip',
                    db='db')

            s.add('lat', '')
            s.add('lng', '')
            s.add('CD', '')

            s.c.execute("UPDATE %s SET lat='', lng='', CD=''" % tbl)

        ##    Original author's notes:
        ##    HERE ARE MY ASSUMPTIONS TO PUT LNG/LAT INTO LOU'S GRANT FILE -- 944,549 total records
        ##      1. City, State match is more precise than Zipcode match (sometimes Zip is just wrong..) Use that as default -- (206,369) 21.8%
        ##      2. If City, State match doesn't happen, then I default to Zipcode match ... small (3,998) 0.4%
        ##      3. If CD, State match -- use Zipcode centroid.  (693,922) 73.5%
        ##      4. 1-3 not capturing anything BUT city is filled (I did a quick scan, these all basically look foreign, see CSV) (7,217) 0.7%
        ##      5. Organization labeled as "UNKNOWN" (without City, State) - (30,640) 3.3%
        ##      6. Blanks
        ##         a) create frequency table of Standized_Organization with Zipcodes.
        ##         b) check if organization exists in database.  If so, align it with most frequent Zipcode combo (1,356) 0.1%
        ##      7. Blank (non 6) (1007) 0.1%
        ##      8. UNKNOWN, Blank or "Foreign" -- Delete for now, although I have the CSV output saved as blankCode.csv (38,864)
        ##      9. Remaining records: (905,685) 95.8%

        if s.c.execute("SELECT count(*) FROM %s WHERE lat='' or lat is null" %
                       tbl).fetchone()[0] > 0:
            update_sql = ("UPDATE %s SET lat=?, lng=?, State=?, CD=? WHERE %s" %
                          (tbl, locQStr))
            # Each pass selects one row per location group and writes its
            # values back to every row of that group. Order matters: a later
            # pass overwrites an earlier one, and pass 3 compares against the
            # State column that passes 1 and 2 have just written.
            # The selected columns are not aggregated, so which row of a group
            # supplies the values (and is tested by HAVING) is up to SQLite.
            passes = (
                # 1. Default everything to the city/state-derived "2" values.
                ("lat2, lng2, State2, CD2", ""),
                # 2. Where CD2 is blank/NULL, fall back to the
                #    zipcode-derived "1" values.
                ("lat1, lng1, State1, CD1", " HAVING CD2='' or CD2 is null"),
                # 3. Where State1 equals the current State, take the "1" values.
                ("lat1, lng1, State1, CD1", " HAVING State1=State"),
            )
            for columns, having in passes:
                rows = s.c.execute(
                    "SELECT %s, %s FROM %s GROUP BY %s%s" %
                    (columns, locStr, tbl, locStr, having)).fetchall()
                if rows:
                    s.c.executemany(update_sql, rows)

        s.close()