예제 #1
0
def findIncidentsForAllTweets():
    """Try to attach an incident to every tweet that does not have one yet.

    Selects (id, datetime, text) for each tweet whose incidentNumber is null
    and passes each row to findIncidentForTweet.
    """
    cursor = cnxnMgr.getCursor()
    cursor.execute(
        "select id, datetime, text from tweet where incidentNumber is null")
    # Fetch every row up front: findIncidentForTweet asks cnxnMgr for a cursor
    # itself and runs further statements while we are still looping.
    # NOTE(review): presumably that can be this same cursor - confirm.
    for tweet in cursor.fetchall():
        findIncidentForTweet(tweet)
예제 #2
0
def getOriginalDataForDate(single_date):
    """Scrape the Seattle real-time 911 page for one day and store its incidents.

    Builds the request URL for ``single_date``, parses the returned HTML and
    calls initialProcessForIncident once for every table row that carries an
    ``id`` attribute and a non-empty incident id.

    Args:
        single_date: object with ``strftime`` (e.g. ``datetime.date``); it is
            formatted as MM/DD/YYYY for the ``incDate`` query parameter.
    """
    dateString = single_date.strftime("%m/%d/%Y")
    params = urllib.parse.urlencode({"incDate": dateString, "rad1": "des"})
    url = "http://www2.seattle.gov/fire/realtime911/getRecsForDatePub.asp?" + params
    print(url)

    tree = etree.parse(url, etree.HTMLParser())

    # All table rows with an id defined, walked last-to-first.
    # NOTE(review): presumably "rad1=des" asks for newest-first, so reversing
    # stores the oldest incident first - confirm against the site.
    incidentRows = tree.xpath("//tr[@id]")
    for incidentRow in reversed(incidentRows):
        cells = incidentRow.xpath("td")
        if len(cells) < 6:  # malformed row - cannot be unpacked below
            continue
        incidentTime = cells[0].text
        incidentId = cells[1].text
        try:
            level = int(cells[2].text)
        except (TypeError, ValueError):
            # Empty cell (None) or non-numeric text: fall back to level 1.
            level = 1
        units = cells[3].text
        location = cells[4].text
        incidentType = cells[5].text
        print(incidentId)
        if not incidentId:  # bad row - no idea what to do
            continue
        initialProcessForIncident(incidentId, incidentTime, level, units,
                                  location, incidentType)
예제 #3
0
def backfill():
    """Re-run the per-day location check over a fixed historical range.

    Iterates ``daterange`` from 2017-06-15 to today's date, printing each day
    and passing it to ``places.checkLocationByDate`` with a shared cursor.
    """
    db_cursor = cnxnMgr.getCursor()

    first_day = date(2017, 6, 15)  # need to run backfill for june 18
    last_day = date.today()
    for day in daterange(first_day, last_day):
        print(day)
        places.checkLocationByDate(db_cursor, day)
예제 #4
0
def setStreet():
    """Fill street_number / street_name / cross_street from raw_location.

    First pass, over locations with a place but no street_name:
      * "A / B" or "...x/B" (exactly one slash) -> street_name, cross_street
      * "<digits> av..." is a numbered avenue, not a house number -> skipped
      * "<digits> rest" -> street_number, street_name
      * "<digits>-<digits> rest" -> first number as street_number, street_name

    Second pass, over street names that start with "- ": strips a leading
    "- <digits> " prefix from street_name.

    Every update is committed immediately.
    """
    cursor = cnxnMgr.getCursor()

    def _run(sql, *params):
        # Execute one update and commit it straight away.
        cursor.execute(sql, *params)
        cursor.commit()

    set_cross_street = "update location set street_name = ?, cross_street = ? where id = ?"
    set_street_number = "update location set street_number = ?, street_name = ? where id = ?"

    results = cursor.execute(
        "select * from location where place is not null and street_name is null"
    ).fetchall()  # about 140,000 max - fits in memory
    for row in results:
        location = row.raw_location
        lowered = location.lower()
        if re.search(r" [/] ", lowered) or re.search(r"[a-z] ?[/]", lowered):
            # Intersection: only split when there is exactly one slash.
            parts = location.split("/")
            if len(parts) == 2:
                _run(set_cross_street, parts[0], parts[1], row.id)
        elif re.match(r"\d+ av[ e]", lowered):
            # if a numbered ave then it's not a house number
            pass
        elif re.match(r"\d+ ", location):
            parts = location.split(" ", 1)
            if len(parts) == 2:
                _run(set_street_number, parts[0], parts[1], row.id)
        elif re.match(r"\d+-\d+ ", location):
            parts = location.split(" ", 1)
            if len(parts) == 2:
                # Keep only the first number of a "12-34" range.
                numbers = parts[0].split("-", 1)
                if len(numbers) == 2:
                    _run(set_street_number, numbers[0], parts[1], row.id)

    results = cursor.execute(
        "select * from location where place is not null and street_name like '- %'"
    ).fetchall()
    for row in results:
        street_name = row.street_name
        if re.match(r"- \d+ ", street_name):
            # "-", "<digits>", "<rest>" - keep only the rest.
            parts = street_name.split(" ", 2)
            if len(parts) == 3:
                _run("update location set street_name = ? where id = ?",
                     parts[2], row.id)
예제 #5
0
def updateTweets():
    """Store SeattleFire tweets posted after the most recent stored tweet.

    Looks up the id of the newest row in ``tweet``, asks the API for up to 200
    timeline entries since that id, converts each creation time with
    ``utc_to_local`` and saves the tweet through ``addTweet``.
    """
    created_format = "%a %b %d %H:%M:%S %z %Y"
    db_cursor = cnxnMgr.getCursor()
    db_cursor.execute("select top 1 id from tweet order by datetime desc")
    for newest in db_cursor.fetchall():
        newest_id = newest[0]
        timeline = api.GetUserTimeline(screen_name="SeattleFire",
                                       count=200,
                                       since_id=newest_id)
        for status in timeline:
            created = datetime.datetime.strptime(status.created_at,
                                                 created_format)
            local_time = utc_to_local(created)
            addTweet(status.id, status.text, local_time)
            print(status.id)
예제 #6
0
def initialProcessForIncident(incidentId, datetime, level, units, location, type):
    """Write one scraped incident and its related rows.

    Always writes the incident itself. Units, type and location are written
    (together with their link rows to the incident) only when the respective
    value is truthy. A location that is not known yet is first resolved via
    ``places.getLocationForAddress`` and stored.
    """
    db = cnxnMgr.getCursor()
    writeIncident(db, incidentId, datetime, level)

    if units:
        writeUnits(db, units)
        writeIncidentUnits(db, incidentId, units)

    if type:
        writeType(db, type)
        writeIncidentType(db, incidentId, type)

    if not location:
        return

    already_known = doesLocationExist(db, location)
    if not already_known:
        resolved = places.getLocationForAddress(location)
        writeLocation(db, location, resolved)
        # split street names & numbers
    writeIncidentLocation(db, incidentId, location)
예제 #7
0
def checkForAv():
    """One-time cleanup of numbered avenues that were split as house numbers.

    For every location with a place whose street_name starts with "av"
    (case-insensitive), checks whether raw_location starts with
    "<digits> av" followed by a space, "e" or "/". If so, place,
    street_number, street_name and cross_street are reset to NULL (committed
    per row) and the raw location and old street name are printed.
    """
    cursor = cnxnMgr.getCursor()
    results = cursor.execute(
        "select * from location where place is not null and lower(street_name) like 'av%'"
    ).fetchall()
    for row in results:
        location = row.raw_location
        if re.match(r"^\d+ av[ e/]", location.lower()):
            cursor.execute(
                "update location set place = null, street_number = null, street_name = null, cross_street = null where id = ?",
                row.id)
            cursor.commit()
            print(location)
            print(row.street_name)
            print()
예제 #8
0
def readRawData():
    """Bring tweets and incidents up to date, then link tweets to incidents.

    Steps:
      1. Fetch new tweets (``twitterCollector.updateTweets``).
      2. Starting from the newest stored incident (plus one hour, truncated to
         a date), scrape every day in ``daterange(start, today)`` and run the
         location check for it.
      3. Try to match all still-unmatched tweets to incidents.
    """
    # read all tweets since last one
    twitterCollector.updateTweets()

    cursor = cnxnMgr.getCursor()
    cursor.execute("select top 1 datetime from incident order by datetime desc")
    rows = cursor.fetchall()
    if rows:
        start_date = (rows[0][0] + timedelta(hours=1)).date()
    else:
        # Empty incident table: previously start_date was left unbound and the
        # loop below raised. Fall back to the first day the source has data
        # for (2003-11-07, per the original note in this function).
        start_date = date(2003, 11, 7)
    end_date = date.today()
    for single_date in daterange(start_date, end_date):
        getOriginalDataForDate(single_date)
        places.checkLocationByDate(cursor, single_date)

    twitterCollector.findIncidentsForAllTweets()
예제 #9
0
def getDetail(itemNumbers):
    """Return full details for the given incident numbers.

    Args:
        itemNumbers: iterable of incident-number strings. Each must be one of
            the letters F, B, M, V, S, T followed by ASCII digits only.

    Returns:
        dict keyed by incident number. Each value has the keys "number",
        "location" (a (lat, long) tuple), "datetime", "type", "rawlocation",
        "unit" (list of distinct unit names) and "tweet" (list of distinct
        tweet ids). An empty dict is returned when the input is empty or when
        ANY number fails validation.
    """
    output = {}
    numbers = list(itemNumbers)  # iterated twice below; also accepts generators

    # Guard against SQL injection: the numbers are spliced into the statement
    # below, so the whole string must match - fullmatch (unlike match + "$")
    # also rejects a trailing newline, and [0-9] rejects non-ASCII digits.
    for number in numbers:
        if not re.fullmatch(r"[FBMVST][0-9]+", number):
            return output
    if not numbers:
        return output

    cursor = getCursor()
    quotedNumbers = "'%s'" % "','".join(numbers)
    sql = """
        select incident.number, incident.datetime, location.place.Lat, location.place.Long, IT.raw_type, IU.unit_name, location.raw_location, tweet.id from incident
        inner join incident_type as IT on incident.number = IT.incidentNumber
        inner join incident_location as IL on incident.number = IL.incidentNumber
        inner join incident_unit as IU on incident.number = IU.incidentNumber
        inner join location on IL.raw_location = location.raw_location
        left join tweet on tweet.incidentNumber = incident.number
        where
        incident.number in (%s)
        """ % quotedNumbers

    # The joins yield one row per (unit, tweet) combination of an incident, so
    # scalar fields are taken from the first row and the lists are de-duplicated.
    for row in cursor.execute(sql):
        incidentNumber = row[0]
        incidentUnit = row[5]
        tweetId = row[7]
        if incidentNumber not in output:
            output[incidentNumber] = {
                "unit": [],
                "tweet": [],
                "number": incidentNumber,
                "location": (row[2], row[3]),
                "datetime": row[1],
                "type": row[4],
                "rawlocation": row[6],
            }
        detail = output[incidentNumber]
        if incidentUnit not in detail["unit"]:
            detail["unit"].append(incidentUnit)
        if tweetId and tweetId not in detail["tweet"]:
            detail["tweet"].append(tweetId)

    return output
예제 #10
0
def query():
    """HTTP handler: return incidents matching the request's filters as JSON.

    Query parameters read from ``request.args``:
        unit, type: repeatable; if any value is not in allunits() / alltypes()
            the whole list is discarded.
        location: currently always ignored (see TODO below).
        region: passed through; its elements are checked in queries.py.
        startdate, enddate: "YYYY-MM-DD"; default 2001-01-01 and 2030-01-01.
            enddate is extended to 23:59:59 of that day.

    Returns:
        A ``Response`` with mimetype application/json and
        ``Access-Control-Allow-Origin: *``. The body is "{}" if all three
        attempts to run the query fail.

    Raises:
        ValueError: from ``strptime`` if startdate/enddate is malformed.
    """
    args = request.args
    units = args.getlist('unit')
    types = args.getlist('type')
    region = args.get('region', "")
    startdate = args.get('startdate', "2001-01-01")
    enddate = args.get('enddate', "2030-01-01")

    startdate = datetime.strptime(startdate, "%Y-%m-%d")
    enddate = datetime.strptime(enddate, "%Y-%m-%d").replace(hour=23,
                                                             minute=59,
                                                             second=59)
    # clean data input - reject all data if not in approved list
    if not set(types).issubset(alltypes()):
        types = []
    if not set(units).issubset(allunits()):
        units = []
    # the check for elements of region happens in queries.py
    # TODO: need to check the elements of location
    locations = []  # just block location for now
    # no need to check dateRange - since converted to dates prevents SQL injection
    output = "{}"
    for _ in range(3):
        try:
            output = dumps(getIncidents(units=units,
                                        types=types,
                                        locations=locations,
                                        region=region,
                                        dateRange=(startdate, enddate)),
                           default=json_serial)
            break
        except Exception:
            # Log, reset the cursor and retry.
            # NOTE(review): forced=True presumably reopens the DB connection - confirm.
            output = "{}"
            getCursor(forced=True)
            traceback.print_exc()

    resp = Response(output, mimetype="application/json")
    resp.headers['Access-Control-Allow-Origin'] = '*'
    return resp
예제 #11
0
def lookForFalseMatches():
    """Unlink tweets whose text does not mention the linked incident's street.

    For every tweet that has an incidentNumber, loads the incident's location
    rows. At the first row whose first street content word (lower-cased) is
    not a substring of the lower-cased tweet text, the tweet, raw location,
    raw type and the word are printed and the tweet's incident link is removed.
    """
    incident_sql = """
        select incident.number, incident.datetime, IT.raw_type, location.raw_location, location.street_number, location.street_name, location.cross_street from incident
        inner join incident_type as IT on incident.number = IT.incidentNumber
        inner join incident_location as IL on incident.number = IL.incidentNumber
        inner join location on IL.raw_location = location.raw_location
        where incident.number = ?
        """
    db_cursor = cnxnMgr.getCursor()
    db_cursor.execute(
        "select id, datetime, text, incidentNumber from tweet where incidentNumber is not null"
    )
    linked_tweets = db_cursor.fetchall()
    for linked in linked_tweets:
        tweet_text = linked[2]
        for candidate in db_cursor.execute(incident_sql, linked[3]):
            # pretty good check, but still misses typos in tweets like Genesse/Genesee or abbrevs Lk Wash/Lake Washington
            street_mentioned = getFirstStreetContentWord(
                candidate[5]).lower() in tweet_text.lower()
            if street_mentioned:
                continue
            print(tweet_text)
            print(candidate[3])
            print(candidate[2])
            print(getFirstStreetContentWord(candidate[5]))
            removeIncidentFromTweet(linked[0])
            break
예제 #12
0
def getIncidents(units=[], types=[], locations=[], region="", dateRange=()):
    """Query incidents matching the filters and aggregate them for display.

    Args:
        units: unit names; an incident must involve ALL of them (one inner
            join per unit).
        types: raw type names; an incident may match ANY of them.
        locations: raw locations; an incident may match ANY of them.
        region: "lat,long,lat,long" bounding box. It is used only if it has
            exactly four parts that all pass ``isFloat``; otherwise ignored.
        dateRange: empty, or a (start, end) pair of objects with
            ``isoformat()``; filters ``incident.datetime`` with BETWEEN.

    The list defaults are never mutated here.

    Returns:
        dict with
          "incident": {number: {"location": (lat, long), "number": number}}
              for at most the first 10000 distinct incidents; coordinates are
              passed through ``twiddle``.
          "display": always "all".
          "totals": counts per "type", "year", "hour" (24 slots), "month"
              (13 slots, index 0 unused), "weekday" (7 slots, as returned by
              ``datetime.weekday()``) - each counted once per distinct
              incident - and "unit", counted once per result row.
    """
    types = set(types)  # remove duplicates
    cursor = getCursor()

    def _quotedList(values):
        # 'a','b' body for an IN list, doubling embedded single quotes.
        return "','".join(value.replace("'", "''") for value in values)

    # uses AND for the list of units, but OR for other lists
    # only a single region or daterange accepted
    # Aliases are numbered (iu0, iu1, ...) rather than derived from the unit
    # name, so the name only ever appears as an escaped string literal.
    unitJoins = "".join(
        ("inner join incident_unit as iu{0} on incident.number = iu{0}.incidentNumber"
         " and iu{0}.unit_name = '{1}' ").format(index, unit.replace("'", "''"))
        for index, unit in enumerate(units))
    typeClause = "IT.raw_type in ('" + _quotedList(types) + "')" if types else '1=1'
    locationClause = ("IL.raw_location in ('" + _quotedList(locations) + "')"
                      if locations else '1=1')

    geoPrefix = ""
    geoBody = "1=1"
    if region:
        parts = region.split(",")
        if len(parts) == 4 and all(isFloat(i) for i in parts):
            lats = (parts[0], parts[2])
            longs = (parts[1], parts[3])
            # NOTE(review): min/max here compare the coordinate STRINGS, not
            # numbers, which decides the order of the polygon's corners. Left
            # exactly as it was - confirm with the DB's ring-orientation rules
            # before "fixing" it.
            geoPrefix = "DECLARE @g geography; SET @g = geography::STPolyFromText('POLYGON(({2} {0}, {2} {1}, {3} {1}, {3} {0}, {2} {0}))', 4326);".format(
                min(lats), max(lats), min(longs), max(longs))
            geoBody = "@g.STContains(location.place) = 1"
    dateClause = "1=1"
    if dateRange:
        dateClause = "incident.datetime between '" + dateRange[0].isoformat(
        ) + "' and '" + dateRange[1].isoformat() + "'"

    output = {
        "incident": {},
        "display": "all",
        "totals": {
            "type": {},
            "unit": {},
            "weekday": [0] * 7,
            "month": [0] * 13,
            "hour": [0] * 24,
            "year": {}
        }
    }  # note: month zero will never happen - one-based

    sql = """
        {}
        select incident.number, incident.datetime, location.place.Lat, location.place.Long, IT.raw_type, IU.unit_name, location.raw_location from incident
        {}
        inner join incident_type as IT on incident.number = IT.incidentNumber
        inner join incident_location as IL on incident.number = IL.incidentNumber
        inner join incident_unit as IU on incident.number = IU.incidentNumber
        inner join location on IL.raw_location = location.raw_location
        where
        {}
        and  {}
        and  {}
        and  {}
        """.format(geoPrefix, unitJoins, typeClause, locationClause, geoBody,
                   dateClause)
    print(sql)

    partialDataLimit = 10000
    incidents = output["incident"]
    totals = output["totals"]
    # Distinct incidents seen so far. Tracked separately from `incidents`,
    # which stops growing at partialDataLimit; otherwise incidents beyond the
    # limit would be counted again for every extra joined row.
    counted = set()
    for row in cursor.execute(sql):
        incidentNumber = row[0]
        incidentDateTime = row[1]
        incidentType = row[4]
        incidentUnit = row[5]
        if incidentNumber not in counted:
            if len(counted) < partialDataLimit:
                incidents[incidentNumber] = {
                    "location": (twiddle(row[2]), twiddle(row[3])),
                    "number": incidentNumber
                }
            counted.add(incidentNumber)

            totals["type"][incidentType] = totals["type"].get(incidentType, 0) + 1
            year = incidentDateTime.year
            totals["year"][year] = totals["year"].get(year, 0) + 1
            totals["hour"][incidentDateTime.hour] += 1
            totals["month"][incidentDateTime.month] += 1  # note: using 1-based months
            totals["weekday"][incidentDateTime.weekday()] += 1

        # Unit totals are per result row, not per incident.
        totals["unit"][incidentUnit] = totals["unit"].get(incidentUnit, 0) + 1

    return output
예제 #13
0
def removeIncidentFromTweet(tweetId):
    """Clear the incidentNumber of the tweet with id ``tweetId`` and commit."""
    unlink_sql = "update tweet set incidentNumber = NULL where id = ?"
    db = cnxnMgr.getCursor()
    db.execute(unlink_sql, tweetId)
    db.commit()
예제 #14
0
def assignIncidentToTweet(incidentNumber, tweetId):
    """Set the incidentNumber of the tweet with id ``tweetId`` and commit."""
    link_sql = "update tweet set incidentNumber = ? where id = ?"
    db = cnxnMgr.getCursor()
    db.execute(link_sql, incidentNumber, tweetId)
    db.commit()
예제 #15
0
def _streetWordInTweet(street, tweetWords):
    # True if the street's first content word (lower-cased) is one of the tweet's words.
    return bool(street) and getFirstStreetContentWord(
        street).lower() in tweetWords


def _incidentMatchesTweet(incident, tweetText, tweetWords):
    # Decide whether one incident row plausibly describes the tweet's location.
    streetNumber, streetName, crossStreet = incident[5], incident[6], incident[7]

    # house number (2+ chars) followed by a space, plus street content word
    if (streetNumber and len(streetNumber) > 1
            and streetNumber + ' ' in tweetText
            and _streetWordInTweet(streetName, tweetWords)):
        return True

    # house number lines up with the "<hundreds>00 block" in the tweet, plus
    # street content word; the number is escaped because it is data, not pattern
    if (streetNumber and len(streetNumber) > 2
            and re.search(
                " " + re.escape(streetNumber[:-2]) + "00 ?bl?o?c?k", tweetText)
            and _streetWordInTweet(streetName, tweetWords)):
        return True

    # both the street and the cross street appear as distinct words
    return (bool(crossStreet)
            and _streetWordInTweet(streetName, tweetWords)
            and _streetWordInTweet(crossStreet, tweetWords))


def findIncidentForTweet(tweet):
    """Link a tweet to the first incident whose address appears in its text.

    Args:
        tweet: row indexed as (id, datetime, text).

    Considers incidents from 24 hours before to 1 hour after the tweet's
    datetime. The first row that matches (see _incidentMatchesTweet) is
    printed and assigned to the tweet; at most one incident is assigned.

    The experimental matchers that used to follow the first ``return``
    (longest-common-substring and PIO-unit heuristics) were unreachable and
    relied on an undefined ``seen`` list; they have been removed.
    """
    cursor = cnxnMgr.getCursor()
    lower = tweet[1] + timedelta(hours=-24)
    upper = tweet[1] + timedelta(hours=1)
    tweetText = tweet[2]
    tweetWords = tweetText.lower().split()
    for incident in cursor.execute(
            """
    select incident.number, incident.datetime, IT.raw_type, IU.unit_name, location.raw_location, location.street_number, location.street_name, location.cross_street from incident
    inner join incident_type as IT on incident.number = IT.incidentNumber
    inner join incident_location as IL on incident.number = IL.incidentNumber
    inner join incident_unit as IU on incident.number = IU.incidentNumber
    inner join location on IL.raw_location = location.raw_location
    where incident.datetime between ? and ?
    """, lower, upper):
        if _incidentMatchesTweet(incident, tweetText, tweetWords):
            print(incident[2])
            print(incident[4])
            print(tweetText)
            print()
            assignIncidentToTweet(incident[0], tweet[0])
            break
예제 #16
0
def addTweet(id, text, datetime):
    """Insert a tweet row unless a row with the same id already exists; commit."""
    insert_if_missing = "if not exists (select id from tweet where id = ?) insert into tweet(id, text, datetime) values (?, ?, ?)"
    db = cnxnMgr.getCursor()
    # id is bound twice: once for the existence check, once for the insert.
    db.execute(insert_if_missing, id, id, text, datetime)
    db.commit()