def findIncidentsForAllTweets():
    cursor = cnxnMgr.getCursor()
    seen = []
    cursor.execute(
        "select id, datetime, text from tweet where incidentNumber is null")
    for tweet in cursor.fetchall():
        findIncidentForTweet(tweet)
def getOriginalDataForDate(single_date):
    cursor = cnxnMgr.getCursor()
    dateString = single_date.strftime("%m/%d/%Y")
    params = urllib.parse.urlencode({"incDate": dateString, "rad1": "des"})
    url = "http://www2.seattle.gov/fire/realtime911/getRecsForDatePub.asp?" + params
    print(url)
    parser = etree.HTMLParser()
    tree = etree.parse(url, parser)
    root = tree.getroot()
    incidentRows = tree.xpath("//tr[@id]")  # all table rows with an id defined
    for incidentRow in reversed(incidentRows):
        item = incidentRow.xpath("td")
        datetime = item[0].text
        incidentId = item[1].text
        try:
            level = int(item[2].text)
        except:  # missing or non-numeric level defaults to 1
            level = 1
        units = item[3].text
        location = item[4].text
        type = item[5].text
        print(incidentId)
        if not incidentId:  # bad row - no idea what to do
            continue
        initialProcessForIncident(incidentId, datetime, level, units, location, type)
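# Hedged usage sketch for the row-parsing logic above: the same xpath expressions
# can be exercised against a local HTML snippet instead of the live realtime911
# page. The sample markup and incident number below are hypothetical; they only
# mirror the six-cell layout (datetime, incident id, level, units, location, type)
# that getOriginalDataForDate() assumes.
def _demoParseRows():
    from lxml import etree
    sample = """
    <table>
      <tr id="row_1">
        <td>6/15/2017 1:23:45 AM</td><td>F170061234</td><td>1</td>
        <td>E16</td><td>1234 Example Av N</td><td>Aid Response</td>
      </tr>
    </table>
    """
    tree = etree.fromstring(sample, etree.HTMLParser())
    for incidentRow in tree.xpath("//tr[@id]"):
        cells = [td.text for td in incidentRow.xpath("td")]
        print(cells)  # ['6/15/2017 1:23:45 AM', 'F170061234', '1', 'E16', ...]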
def backfill():
    cursor = cnxnMgr.getCursor()
    start_date = date(2017, 6, 15)  # need to run backfill for june 18
    end_date = date.today()
    for single_date in daterange(start_date, end_date):
        print(single_date)
        places.checkLocationByDate(cursor, single_date)
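# daterange() is used by backfill() and readRawData() but is not shown in this
# section; a minimal sketch, assuming it yields each date from start_date up to
# (but not including) end_date:
from datetime import timedelta

def daterange(start_date, end_date):
    # yield one date per day in [start_date, end_date)
    for n in range((end_date - start_date).days):
        yield start_date + timedelta(days=n)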
def setStreet():
    cursor = cnxnMgr.getCursor()
    # if place is not null:
    #   if it contains a space-slash-space, separate the two parts into street and cross street
    #   if it starts with a number, pull the number off and write the rest
    results = cursor.execute(
        "select * from location where place is not null and street_name is null"
    ).fetchall()  # about 140,000 max - fits in memory
    # results = cursor.execute("select * from location where raw_location like '%/%' and street_name is null").fetchall()  # about 140,000 max - fits in memory
    for row in results:
        location = row.raw_location
        if re.search(" [/] ", location.lower()):
            parts = location.split("/")
            if len(parts) == 2:
                cursor.execute(
                    "update location set street_name = ?, cross_street = ? where id = ?",
                    parts[0], parts[1], row.id)
                cursor.commit()
        elif re.search("[a-z] ?[/]", location.lower()):
            parts = location.split("/")
            if len(parts) == 2:
                cursor.execute(
                    "update location set street_name = ?, cross_street = ? where id = ?",
                    parts[0], parts[1], row.id)
                cursor.commit()
        elif re.match(r"\d+ av[ e]", location.lower()):
            # if a numbered ave then it's not a house number
            pass
        elif re.match(r"\d+ ", location):
            parts = location.split(" ", 1)
            if len(parts) == 2:
                cursor.execute(
                    "update location set street_number = ?, street_name = ? where id = ?",
                    parts[0], parts[1], row.id)
                cursor.commit()
        elif re.match(r"\d+-\d+ ", location):
            parts = location.split(" ", 1)
            if len(parts) == 2:
                numbers = parts[0].split("-", 1)
                if len(numbers) == 2:
                    cursor.execute(
                        "update location set street_number = ?, street_name = ? where id = ?",
                        numbers[0], parts[1], row.id)
                    cursor.commit()

    results = cursor.execute(
        "select * from location where place is not null and street_name like '- %'"
    ).fetchall()
    for row in results:
        street_name = row.street_name
        if re.match(r"- \d+ ", street_name):
            parts = street_name.split(" ", 2)
            if len(parts) == 3:
                cursor.execute(
                    "update location set street_name = ? where id = ?",
                    parts[2], row.id)
                cursor.commit()
def updateTweets():
    cursor = cnxnMgr.getCursor()
    cursor.execute("select top 1 id from tweet order by datetime desc")
    for row in cursor.fetchall():
        id = row[0]
        results = api.GetUserTimeline(screen_name="SeattleFire", count=200, since_id=id)
        for r in results:
            d = datetime.datetime.strptime(r.created_at, "%a %b %d %H:%M:%S %z %Y")
            u = utc_to_local(d)
            addTweet(r.id, r.text, u)
            print(r.id)
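# utc_to_local() is referenced above but not defined in this section; a minimal
# sketch, assuming it converts the aware UTC timestamp parsed from created_at to
# the machine's local time and drops the tzinfo before the value is stored:
def utc_to_local(utc_dt):
    # astimezone() with no argument converts an aware datetime to local time (Python 3.3+);
    # tzinfo is stripped so the result matches the naive datetimes used elsewhere
    return utc_dt.astimezone(tz=None).replace(tzinfo=None)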
def initialProcessForIncident(incidentId, datetime, level, units, location, type):
    cursor = cnxnMgr.getCursor()
    writeIncident(cursor, incidentId, datetime, level)
    if units:
        writeUnits(cursor, units)
        writeIncidentUnits(cursor, incidentId, units)
    if type:
        writeType(cursor, type)
        writeIncidentType(cursor, incidentId, type)
    if location:
        if not doesLocationExist(cursor, location):
            loc = places.getLocationForAddress(location)
            writeLocation(cursor, location, loc)  # split street names & numbers
        writeIncidentLocation(cursor, incidentId, location)
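# The write* helpers and doesLocationExist() used above are not shown in this
# section; a minimal sketch of doesLocationExist, assuming the raw location string
# is the key the location table is looked up by (as the joins elsewhere in this
# code suggest):
def doesLocationExist(cursor, location):
    cursor.execute("select 1 from location where raw_location = ?", location)
    return cursor.fetchone() is not None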
def checkForAv():
    # one-time method
    cursor = cnxnMgr.getCursor()
    results = cursor.execute(
        "select * from location where place is not null and lower(street_name) like 'av%'"
    ).fetchall()
    for row in results:
        location = row.raw_location
        if re.match(r"^\d+ av[ e/]", location.lower()):
            cursor.execute(
                "update location set place = null, street_number = null, street_name = null, cross_street = null where id = ?",
                row.id)
            cursor.commit()
            print(location)
            print(row.street_name)
            print()
def readRawData():
    twitterCollector.updateTweets()  # read all tweets since the last one
    cursor = cnxnMgr.getCursor()
    cursor.execute("select top 1 datetime from incident order by datetime desc")
    # start_date = date(2003, 11, 7) - data start
    # start_date = date(2017, 7, 17)  # restart - run Jul 1
    for row in cursor.fetchall():
        start_date = (row[0] + timedelta(hours=1)).date()
        break
    end_date = date.today()
    for single_date in daterange(start_date, end_date):
        getOriginalDataForDate(single_date)
        places.checkLocationByDate(cursor, single_date)
    twitterCollector.findIncidentsForAllTweets()
def getDetail(itemNumbers):
    output = {}
    cursor = getCursor()
    # guard against SQL injection
    for number in itemNumbers:
        if not re.match(r"[FBMVST]\d+$", number):
            return output
    for row in cursor.execute("""
            select incident.number, incident.datetime,
                   location.place.Lat, location.place.Long,
                   IT.raw_type, IU.unit_name, location.raw_location, tweet.id
            from incident
            inner join incident_type as IT on incident.number = IT.incidentNumber
            inner join incident_location as IL on incident.number = IL.incidentNumber
            inner join incident_unit as IU on incident.number = IU.incidentNumber
            inner join location on IL.raw_location = location.raw_location
            left join tweet on tweet.incidentNumber = incident.number
            where incident.number in (%s)
            """ % ("'%s'" % "','".join(itemNumbers))):
        incidentNumber = row[0]
        incidentDateTime = row[1]
        incidentLat = row[2]
        incidentLong = row[3]
        incidentType = row[4]
        incidentUnit = row[5]
        rawLocation = row[6]
        tweetId = row[7]
        if incidentNumber not in output:
            output[incidentNumber] = {"unit": [], "tweet": []}
        output[incidentNumber]["number"] = incidentNumber
        output[incidentNumber]["location"] = (incidentLat, incidentLong)
        output[incidentNumber]["datetime"] = incidentDateTime
        output[incidentNumber]["type"] = incidentType
        output[incidentNumber]["rawlocation"] = rawLocation
        if incidentUnit not in output[incidentNumber]["unit"]:
            output[incidentNumber]["unit"].append(incidentUnit)
        if tweetId and tweetId not in output[incidentNumber]["tweet"]:
            output[incidentNumber]["tweet"].append(tweetId)
    return output
def query():
    args = request.args
    unit = args.getlist('unit')
    type = args.getlist('type')
    location = args.getlist('location')
    region = args.get('region', "")
    startdate = args.get('startdate', "2001-01-01")
    enddate = args.get('enddate', "2030-01-01")
    startdate = datetime.strptime(startdate, "%Y-%m-%d")
    enddate = datetime.strptime(enddate, "%Y-%m-%d").replace(hour=23, minute=59, second=59)

    # clean data input - reject all data if anything is not in the approved list
    if not set(type).issubset(alltypes()):
        type = []
    if not set(unit).issubset(allunits()):
        unit = []
    # the check for elements of region happens in queries.py
    # TODO: need to check the elements of location
    location = []  # just block location for now
    # no need to check dateRange - conversion to dates prevents SQL injection

    for retry in range(3):
        try:
            output = dumps(
                getIncidents(units=unit, types=type, locations=location,
                             region=region, dateRange=(startdate, enddate)),
                default=json_serial)
            break
        except Exception:
            output = "{}"
            cursor = getCursor(forced=True)  # force a fresh connection before retrying
            traceback.print_exc()

    resp = Response(output, mimetype="application/json")
    resp.headers['Access-Control-Allow-Origin'] = '*'
    return resp
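# json_serial is passed to dumps() above but is not defined in this section; a
# minimal sketch, assuming it exists only to make datetime/date values JSON-safe:
import datetime

def json_serial(obj):
    # fallback serializer for values json.dumps cannot handle natively
    if isinstance(obj, (datetime.datetime, datetime.date)):
        return obj.isoformat()
    raise TypeError("Type %s not serializable" % type(obj))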
def lookForFalseMatches():
    cursor = cnxnMgr.getCursor()
    cursor.execute(
        "select id, datetime, text, incidentNumber from tweet where incidentNumber is not null"
    )
    for tweet in cursor.fetchall():
        for incident in cursor.execute(
                """
                select incident.number, incident.datetime, IT.raw_type,
                       location.raw_location, location.street_number,
                       location.street_name, location.cross_street
                from incident
                inner join incident_type as IT on incident.number = IT.incidentNumber
                inner join incident_location as IL on incident.number = IL.incidentNumber
                inner join location on IL.raw_location = location.raw_location
                where incident.number = ?
                """, tweet[3]):
            # pretty good check, but still misses typos in tweets like Genesse/Genesee
            # or abbrevs like Lk Wash/Lake Washington
            if not getFirstStreetContentWord(incident[5]).lower() in tweet[2].lower():
                print(tweet[2])
                print(incident[3])
                print(incident[2])
                print(getFirstStreetContentWord(incident[5]))
                removeIncidentFromTweet(tweet[0])
                break
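# getFirstStreetContentWord() is used above and in findIncidentForTweet() but is
# not shown in this section. A hedged sketch of one plausible implementation,
# assuming it returns the first "meaningful" word of a street name by skipping
# leading directional prefixes; the real helper may differ:
def getFirstStreetContentWord(street_name):
    directions = {"N", "S", "E", "W", "NE", "NW", "SE", "SW"}
    for word in street_name.split():
        if word.upper() not in directions:
            return word
    return street_name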
def getIncidents(units=[], types=[], locations=[], region="", dateRange=()):
    types = set(types)  # remove duplicates
    cursor = getCursor()
    # uses AND for the list of units, but OR for the other lists
    # only a single region or dateRange is accepted
    unitstring = ""
    for unit in units:
        dbName = "iu" + unit
        unitstring += ("inner join incident_unit as " + dbName +
                       " on incident.number = " + dbName + ".incidentNumber and " +
                       dbName + ".unit_name = '" + unit + "' ")
    type = "IT.raw_type in ('" + "','".join(
        [t.replace("'", "''") for t in types]) + "')" if types else '1=1'
    location = "IL.raw_location in ('" + "','".join(
        locations) + "')" if locations else '1=1'

    geoPrefix = ""
    geoBody = "1=1"
    if region:
        parts = region.split(",")
        if len(parts) == 4 and all(isFloat(i) for i in parts):
            lats = (parts[0], parts[2])
            longs = (parts[1], parts[3])
            geoPrefix = ("DECLARE @g geography; SET @g = geography::STPolyFromText("
                         "'POLYGON(({2} {0}, {2} {1}, {3} {1}, {3} {0}, {2} {0}))', 4326);"
                         ).format(min(lats), max(lats), min(longs), max(longs))
            geoBody = "@g.STContains(location.place) = 1"

    date = "1=1"
    if dateRange:
        date = ("incident.datetime between '" + dateRange[0].isoformat() +
                "' and '" + dateRange[1].isoformat() + "'")

    output = {
        "incident": {},
        "display": "all",
        "totals": {
            "type": {},
            "unit": {},
            "weekday": [0] * 7,
            "month": [0] * 13,  # note: month zero will never happen - one-based
            "hour": [0] * 24,
            "year": {}
        }
    }

    sql = """
        {}
        select incident.number, incident.datetime,
               location.place.Lat, location.place.Long,
               IT.raw_type, IU.unit_name, location.raw_location
        from incident
        {}
        inner join incident_type as IT on incident.number = IT.incidentNumber
        inner join incident_location as IL on incident.number = IL.incidentNumber
        inner join incident_unit as IU on incident.number = IU.incidentNumber
        inner join location on IL.raw_location = location.raw_location
        where {} and {} and {} and {}
        """.format(geoPrefix, unitstring, type, location, geoBody, date)
    print(sql)

    partialDataLimit = 10000
    i = 0
    for row in cursor.execute(sql):
        # for up to N incidents return full data
        # for up to M incidents return only lat/long
        # for more than M, randomly replace data so that a total of M items is returned
        incidentNumber = row[0]
        incidentDateTime = row[1]
        incidentLat = twiddle(row[2])
        incidentLong = twiddle(row[3])
        incidentType = row[4]
        incidentUnit = row[5]
        rawLocation = row[6]
        if incidentNumber not in output["incident"]:
            if i < partialDataLimit:
                output["incident"][incidentNumber] = {
                    "location": (incidentLat, incidentLong),
                    "number": incidentNumber
                }
            if incidentType not in output["totals"]["type"]:
                output["totals"]["type"][incidentType] = 1
            else:
                output["totals"]["type"][incidentType] += 1
            if incidentDateTime.year not in output["totals"]["year"]:
                output["totals"]["year"][incidentDateTime.year] = 1
            else:
                output["totals"]["year"][incidentDateTime.year] += 1
            output["totals"]["hour"][incidentDateTime.hour] += 1
            output["totals"]["month"][incidentDateTime.month] += 1  # note: using 1-based months
            output["totals"]["weekday"][incidentDateTime.weekday()] += 1
            i += 1  # count distinct incidents to determine display type
        else:
            pass
            # output["incident"][incidentNumber]["unit"].append(incidentUnit)
        if incidentUnit not in output["totals"]["unit"]:
            output["totals"]["unit"][incidentUnit] = 1
        else:
            output["totals"]["unit"][incidentUnit] += 1
        # if i > fullDataLimit:
        #     output["display"] = "heatmap"
    return output
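# isFloat() and twiddle() are used by getIncidents() but are not defined in this
# section. isFloat is sketched as a simple parse check; twiddle is sketched as a
# small random jitter applied to a coordinate (a guess at its purpose, and the
# offset size is hypothetical):
import random

def isFloat(value):
    try:
        float(value)
        return True
    except (TypeError, ValueError):
        return False

def twiddle(coordinate):
    # hypothetical: nudge a lat/long slightly so identical points do not stack exactly
    if coordinate is None:
        return None
    return coordinate + random.uniform(-0.0001, 0.0001)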
def removeIncidentFromTweet(tweetId):
    cursor = cnxnMgr.getCursor()
    cursor.execute("update tweet set incidentNumber = NULL where id = ?", tweetId)
    cursor.commit()
def assignIncidentToTweet(incidentNumber, tweetId):
    cursor = cnxnMgr.getCursor()
    cursor.execute("update tweet set incidentNumber = ? where id = ?",
                   incidentNumber, tweetId)
    cursor.commit()
def findIncidentForTweet(tweet):
    cursor = cnxnMgr.getCursor()
    lower = tweet[1] + timedelta(hours=-24)
    upper = tweet[1] + timedelta(hours=1)
    # the same select is used by every matching pass below
    incidentQuery = """
        select incident.number, incident.datetime, IT.raw_type, IU.unit_name,
               location.raw_location, location.street_number,
               location.street_name, location.cross_street
        from incident
        inner join incident_type as IT on incident.number = IT.incidentNumber
        inner join incident_location as IL on incident.number = IL.incidentNumber
        inner join incident_unit as IU on incident.number = IU.incidentNumber
        inner join location on IL.raw_location = location.raw_location
        where incident.datetime between ? and ?
        """
    for incident in cursor.execute(incidentQuery, lower, upper):
        # if house number and street content word are in the tweet
        if (incident[5] and len(incident[5]) > 1 and incident[5] + ' ' in tweet[2]
                and incident[6]
                and getFirstStreetContentWord(incident[6]).lower() in tweet[2].lower().split()):
            print(incident[2])
            print(incident[4])
            print(tweet[2])
            print()
            assignIncidentToTweet(incident[0], tweet[0])
            break
        # if house number lines up with a block in the tweet and street content word is in the tweet
        elif (incident[5] and len(incident[5]) > 2
                and re.search(" " + incident[5][:-2] + "00" + " ?bl?o?c?k", tweet[2])
                and incident[6]
                and getFirstStreetContentWord(incident[6]).lower() in tweet[2].lower().split()):
            print(incident[2])
            print(incident[4])
            print(tweet[2])
            print()
            assignIncidentToTweet(incident[0], tweet[0])
            break
        # if both the street and the cross street are in the tweet text based on distinct words
        elif (incident[6] and incident[7]
                and getFirstStreetContentWord(incident[6]).lower() in tweet[2].lower().split()
                and getFirstStreetContentWord(incident[7]).lower() in tweet[2].lower().split()):
            # if incident[5] + ' ' in tweet[2]:
            print(incident[2])
            print(incident[4])
            print(tweet[2])
            print()
            assignIncidentToTweet(incident[0], tweet[0])
            break
    return

    # I'm not convinced about any of the matching approaches below.
    # Note: this code is unreachable (the return above always fires) and it
    # references a `seen` list that is not defined in this function.
    for incident in cursor.execute(incidentQuery, lower, upper):
        # if both the street and the cross street are in the tweet text based on distinct words
        if (incident[0] not in seen and incident[6] and incident[7]
                and getFirstStreetContentWord(incident[6]).lower() in tweet[2].lower().split()
                and getFirstStreetContentWord(incident[7]).lower() in tweet[2].lower().split()):
            # if incident[5] + ' ' in tweet[2]:
            print(incident[2])
            print(incident[4])
            print(tweet[2])
            print()
            seen.append(incident[0])
            assignIncidentToTweet(incident[0], tweet[0])
            break
    return

    for incident in cursor.execute(incidentQuery, lower, upper):
        # if both the street and the cross street are in the tweet text based on longest common substring
        if (incident[0] not in seen and incident[6]
                and len(longest_common_substring(incident[6], tweet[2])) > 4
                and incident[7]
                and len(longest_common_substring(incident[7], tweet[2])) > 4):
            # if incident[5] + ' ' in tweet[2]:
            print(incident[2])
            print(incident[4])
            print(tweet[2])
            seen.append(incident[0])
            assignIncidentToTweet(incident[0], tweet[0])
            break

    for incident in cursor.execute(incidentQuery + " and IU.unit_name = 'PIO' ", lower, upper):
        # since this is an incident the PIO responded to, we should be biased toward accepting it
        if (incident[0] not in seen and incident[6]
                and len(longest_common_substring(incident[6], tweet[2])) > 4):
            # if incident[5] + ' ' in tweet[2]:
            print(incident[4])
            print(tweet[2])
            seen.append(incident[0])
            # assignIncidentToTweet(incident[0], tweet[0])
            break

    for incident in cursor.execute(incidentQuery, lower, upper):
        # if house number and 5 chars of street name are in the tweet
        if (incident[0] not in seen and incident[5] and len(incident[5]) > 1
                and incident[6]
                and len(longest_common_substring(incident[6], tweet[2])) > 4):
            if incident[5] + ' ' in tweet[2]:
                print(incident[4])
                print(tweet[2])
                seen.append(incident[0])
                assignIncidentToTweet(incident[0], tweet[0])
                break
def addTweet(id, text, datetime):
    cursor = cnxnMgr.getCursor()
    cursor.execute(
        "if not exists (select id from tweet where id = ?) insert into tweet(id, text, datetime) values (?, ?, ?)",
        id, id, text, datetime)
    cursor.commit()