Exemple #1
0
def list_crimes(request,issue_number):
    """
    Render the list of crime reports that belong to one police log issue.

    When the request is a POST whose form data contains the 'geocode'
    key, every report in the list is geocoded first and the resulting
    lat/long are attached to the in-memory objects handed to the
    template.  Nothing is saved by this view.

    IN
    request: the HTTP request
    issue_number: issue number of the police log whose reports are listed

    RETURNS
    HttpResponse with the rendered 'mvvscrape/list_crimes.html' template"""

    #
    # get list of crimes
    crime_list = CrimeReport.objects.filter(policelog__issue_number__exact=issue_number)

    #
    # anything other than a POST (e.g. a plain GET, when we didn't get
    # here via form submit) needs no work before rendering
    if request.method == 'POST':
        posted_keys=request.POST.keys()
        my_logger.debug(posted_keys)
        if 'geocode' in posted_keys:
            my_logger.debug('geocode pushed')

            #
            # geocode the entries in the list
            # iterate over the list itself rather than indexing into it
            # repeatedly, so the object that receives lat/long is exactly
            # the one yielded by the iteration
            geocoder=MvGeocoder()
            for crime in crime_list:
                (lat,lng)=geocoder.geocode(crime.address,crime.map_scale)
                crime.lat=lat
                crime.long=lng

    template=loader.get_template('mvvscrape/list_crimes.html')
    context = Context({'crime_list': crime_list,'issue_number':issue_number})
    return HttpResponse(template.render(context))
Exemple #2
0
    def __init__(self,source_dir=DOWNLOAD_DIR,geocoder='MV'):
        """
        Initialize the instance with a source directory and a geocoder.

        IN
        source_dir: location of html files
        geocoder: name of the geocoder to use; only 'MV' (the
                  'mountain view' geocoder) is available right now

        RAISES
        ValueError if geocoder is anything other than 'MV'"""

        #
        # location of html files
        self.source_dir=source_dir

        #
        # set up geocoder object
        if geocoder=='MV':
            self.geocoder=MvGeocoder()
        else:
            #
            # name the rejected value so the failure can be diagnosed
            raise ValueError("unsupported geocoder: %r" % (geocoder,))
Exemple #3
0
    def test_geocode_addr_38_1(self):
        """
        test geocoding of problem address from issue 38

        a block-scale address ending in 'Cl.' should geocode to the
        expected lat,long strings"""

        crime_report = CrimeReport()
        crime_report.address = '1200 block Cuernavaca Cl.'  # circulo
        crime_report.map_scale = mapscale.BLOCK

        geocoder = MvGeocoder()
        (lat, lng) = geocoder.geocode(crime_report.address,
                                      crime_report.map_scale)

        # assertEqual shows both values when the comparison fails
        self.assertEqual(lat, '37.3716608')
        self.assertEqual(lng, '-122.0640654')
Exemple #4
0
    def test_geocode_block_address(self):
        """
        try to geocode a known address

        a block-scale address should geocode to the expected
        lat,long strings"""

        crime_report = CrimeReport()
        crime_report.address = '300 block mountain view ave.'
        crime_report.map_scale = mapscale.BLOCK

        geocoder = MvGeocoder()
        (lat, lng) = geocoder.geocode(crime_report.address,
                                      crime_report.map_scale)

        # assertEqual shows both values when the comparison fails
        self.assertEqual(lat, '37.3947668')
        self.assertEqual(lng, '-122.0853018')
Exemple #5
0
    def test_geocode_bad_address(self):
        """
        bad addresses should be geocoded with invalid lat,long values

        the geocoder is expected to return the string forms of its
        INVALID_LAT and INVALID_LONG attributes"""

        crime_report = CrimeReport()
        crime_report.address = 'publix'  # there is no Publix in Mountain View
        crime_report.map_scale = mapscale.BLOCK

        geocoder = MvGeocoder()
        (lat, lng) = geocoder.geocode(crime_report.address,
                                      crime_report.map_scale)

        # assertEqual shows both values when the comparison fails
        self.assertEqual(lat, str(geocoder.INVALID_LAT))
        self.assertEqual(lng, str(geocoder.INVALID_LONG))
Exemple #6
0
    def test_geocode_addr_38_2(self):
        """
        test geocoding of problem address from issue 38

        800 block el camino real:
        geocoding fails in this case because the address doesn't specify east or west ECR
        proper response is to return INVALID_LAT,INVALID_LONG"""

        crime_report = CrimeReport()
        crime_report.address = '800 block el camino real'
        crime_report.map_scale = mapscale.BLOCK

        geocoder = MvGeocoder()
        (lat, lng) = geocoder.geocode(crime_report.address,
                                      crime_report.map_scale)

        # assertEqual shows both values when the comparison fails
        self.assertEqual(lat, str(geocoder.INVALID_LAT))
        self.assertEqual(lng, str(geocoder.INVALID_LONG))
Exemple #7
0
class PoliceLogParser(object):
    def __init__(self,source_dir=DOWNLOAD_DIR,geocoder='MV'):
        """
        Set up the parser with a source directory and a geocoder.

        IN
        source_dir: location of html files
        geocoder: name of the geocoder to use; only 'MV' (the
                  'mountain view' geocoder) is available right now

        RAISES
        ValueError if geocoder is anything other than 'MV'"""

        #
        # location of html files
        self.source_dir=source_dir

        #
        # set up geocoder object
        if geocoder=='MV':
            self.geocoder=MvGeocoder()
        else:
            #
            # name the rejected value so the failure can be diagnosed
            raise ValueError("unsupported geocoder: %r" % (geocoder,))


    def extract_crime_category(self,match):
        """
        Pull the crime category text (see common.CRIME_DICT) out of a
        match produced by CATEGORY_REGEXP.

        IN
        match: match object from re.search; must not be None

        RETURNS
        lower-cased text covered by the match's 'crime' group"""

        #
        # callers only get here after a successful search
        assert match is not None

        #
        # slice the 'crime' group out of the searched string, then
        # normalize its case
        (first,last)=match.span('crime')
        category=match.string[first:last].lower()

        my_logger.debug('match(%d,%d)=%s' % (first,last,category))

        return category

    def new_police_report(self):
        """
        Build a blank police report record.

        RETURNS
        new dictionary holding the fields extracted from a police
        report, each one set to its empty default"""

        #
        # (field, default) pairs, listed in the order the fields are created
        defaults = (('category',''),
                    ('original_text',''),
                    ('line_num',0),
                    ('address',''),
                    ('map_scale',mapscale.UNKNOWN),
                    ('date_month',0),
                    ('date_day',0),
                    ('date_year',0),
                    ('lat',''),
                    ('long',''))

        return dict(defaults)

    def parse_report_line(self,line):
        """
        Parse report entry, extract relevant fields.
        Return a dictionary (see new_police_report)
    
        examples:
        BLOCK
        <p class=story_text>200 block Rock St., 1/29<P></p>
        <p class=story_text>1900 block Rock St., 1/29 (recovered)<P></p>
        
        EXACT
        <p class=story_text>2500 West Middlefield Rd., 1/28<P></p>
        
        
        INTERSECTION
        <p class=story_text>Crittenden Lane/N. Shoreline Blvd., 1/30<P></p>
        
        ESTABLISHMENT
        <p class=story_text>McDonald's, El Monte, 1/29<P></p>
        <p class=story_text>Target, 1/26<P></p>
        <p class=story_text>Safeway, North Shoreline Blvd., 1/25 <P></p>
        
        OTHER
        <p class=story_text>Downtown Mountain View, 1/23<P></p>

        IN
        line: one line of the police log; must contain a date that
              REPORT_DATE_REGEXP can find, and a '>' before that date

        RETURNS
        report dictionary with 'original_text', 'date_month', 'date_day',
        'address' and 'map_scale' filled in; the remaining fields keep
        their defaults
        """

        report = self.new_police_report()
        report['original_text'] = line
        
        #
        # extract month and day
        match_date = REPORT_DATE_REGEXP.search(line)
        assert(match_date)
        (month_start,month_stop)=match_date.span('month')
        report['date_month'] = int(line[month_start:month_stop])

        (day_start,day_stop)=match_date.span('day')
        report['date_day'] = int(line[day_start:day_stop])

        my_logger.debug('extracted date (%d/%d)' % (report['date_month'],report['date_day']))

        #############################################
        # extract location & scale
        #
        # keep only the text before the date; the -1 also drops the
        # single character that precedes the month
        line = line[0:month_start-1]
        
        #
        # trim off preceding html and trailing comma
        start_index=line.rfind('>')+1
        assert(start_index>0)

        stop_index=line.rfind(',',start_index)
    
        #
        # rfind returns -1 when there is no comma, so compare against
        # that sentinel rather than against an arbitrary position
        if stop_index != -1:
            #
            # found a comma, 
            line = line[start_index:stop_index]
        else:
            #
            # no comma found
            line = line[start_index:]
        my_logger.debug('truncated string: (%s)' % line)
        report['address']=line
        #
        # try to determine which case:
        # a block
        # an exact address
        # an establishment
        # an intersection
        # special cases, like: "downtown mountain view"
        # 
        # the patterns are tried in this order and the first hit wins

        if BLOCK_REGEXP.match(line) is not None:
            my_logger.debug('BLOCK detected')
            report['map_scale']=mapscale.BLOCK
        elif INTERSECTION_REGEXP.match(line) is not None:
            my_logger.debug('INTERSECTION detected')
            report['map_scale']=mapscale.INTERSECTION
        elif EXACT_REGEXP.match(line) is not None:
            my_logger.debug('EXACT detected')
            report['map_scale']=mapscale.EXACT
        else:
            #
            # must be manually assigned
            report['map_scale']=mapscale.OTHER


        return report
    

    def parse_log(self,filename,log_year):
        """
        Given filename of HTML police log, parse it and return list of police reports

        The file is read line by line through a small state machine:
        the first line only moves the machine out of its initial state,
        then lines are scanned for the first crime category heading, and
        from then on each line is either a new category heading, a report
        (parsed with parse_report_line) or ignored.
        
        In:
        filename - filename of police log, relative to self.source_dir
        log_year - year of report, stored as 'date_year' of every report

        
        Out:
        List of police report dictionaries (see new_police_report),
        or None if the file can't be opened

        Raises:
        ValueError - if, before the first category heading has been found,
                     a line matches the report pattern (whether or not it
                     also matches the category pattern)"""

    
        download_filename=os.sep.join([self.source_dir,filename])
        my_logger.debug("parsing log file: %s" % download_filename)
        try:
            f = open(download_filename,mode='rt')
        except IOError:
            my_logger.debug( "can't open file %s" % download_filename)
            return None

        #
        # return list of report objects
        L=[]

        #
        # parse & extract fields into new report object
        # parse to determine exact category
        # parse to determine geoscope
        state = STATE_INIT
        current_crime_category=None
        line_index = 0
        category_line_index=0
        previous_report_index=0

        #
        # the with block closes the file even when a ValueError aborts
        # the parse part-way through
        with f:
            for line in f:
                line_index=line_index+1
                #
                # state machine:
                # transition from init -> find_category 
                # transition from find_category to find_report after finding first category

                if state==STATE_INIT:
                    #
                    # the first line is consumed by this transition
                    # without being examined
                    state = STATE_FIND_CATEGORY

                elif state==STATE_FIND_CATEGORY:
                    #
                    # find first instance of crime category heading
                    match_crime_header = CATEGORY_REGEXP.search(line)
                    match_report=REPORT_DATE_REGEXP.search(line)

                    if match_crime_header and (match_report is None):
                        #
                        # found crime header
                        my_logger.debug("========== TRANSITION TO FIND_REPORT\n")
                        my_logger.debug('%d %s' % (line_index,line))
                        state = STATE_FIND_REPORT

                        #
                        # remember where this category occurred
                        category_line_index=line_index

                        current_crime_category = self.extract_crime_category(match_crime_header)

                    elif match_crime_header and match_report:
                        #
                        # error: both detectors triggered by this line
                        my_logger.debug('match_crime_header and match_report triggered by (%s)' % line)
                        raise ValueError('line %d matches both the category and the report pattern'
                                         % line_index)
                    elif (match_crime_header is None) and match_report:
                        #
                        # error: found report line before first category
                        my_logger.debug("found report prematurely in (%s)\n" % line)
                        raise ValueError('line %d is a report but no category has been found yet'
                                         % line_index)
                    else:
                        #
                        # neither crime header nor crime report, so ignore it
                        pass

                elif state==STATE_FIND_REPORT:
                    my_logger.debug('%d %s' % (line_index,line[0:-1])) # -1 to avoid extra LF

                    #
                    # sanity check:
                    # "run" of valid reports is too long
                    #
                    # NOTE(review): category_line_index is never greater than
                    # line_index, so this difference is never positive and the
                    # check cannot fire.  Simply swapping the operands would
                    # raise on any content that continues more than 20 lines
                    # past the last category heading, so the condition is
                    # left as it was until the intended rule is confirmed.
                    if (category_line_index-line_index) > 20:
                        my_logger.debug("run of reports too long: skipped category?")
                        raise ValueError('run of reports too long at line %d' % line_index)

                    match_crime_header = CATEGORY_REGEXP.search(line)
                    match_report=REPORT_DATE_REGEXP.search(line)

                    if match_crime_header and (match_report is None):
                        #
                        # came across new crime category
                        current_crime_category = self.extract_crime_category(match_crime_header)
                        category_line_index=line_index

                    elif (match_crime_header is None) and match_report:
                        #
                        # found report
                        report=self.parse_report_line(line)
                        report['category']=current_crime_category
                        report['line_num']=line_index
                        report['date_year']=log_year
                        L.append(report)

                        #
                        # sanity check
                        # reports should be <= 2 lines apart
                        last_seen_index=max(category_line_index,previous_report_index)
                        if (line_index - last_seen_index) > 2:
                            my_logger.debug('WARNING: possible skipped report')
                            my_logger.debug('current line: %d' % line_index)
                            my_logger.debug('last report or category: %d' %
                                            last_seen_index)

                        # remember this line index
                        previous_report_index=line_index

                    else:
                        #
                        # either neither regexp matched or both did;
                        # in this state such lines are ignored
                        pass

        return L


    def parse_log_and_populate_db(self,start_issue,stop_issue):
        """
        Parse the police logs of a range of issues and save every report
        found as a CrimeReport, geocoded with self.geocoder.

        if issue_num doesn't exist, then skip to next;
        an issue whose log has no filename, or whose file can't be
        opened, contributes no reports

        In:
        start_issue - first issue number to process
        stop_issue - last issue number to process (inclusive)"""
        
        for issue_num in range(start_issue,stop_issue+1):
            try:
                police_log=PoliceLog.objects.get(issue_number__exact=issue_num)
            except PoliceLog.DoesNotExist:
                #
                # if issue doesn't exist in db, then go to next issue
                continue

            if police_log.filename:
                #
                # in order to parse log file, must have filename
                #
                # parse_log returns None when the file can't be opened;
                # treat that the same as a log with no reports
                L=self.parse_log(police_log.filename,
                                 police_log.pub_date.year) or []
            else:
                L=[]

            #
            # add each report to db
            for report in L:
                #
                # hash string is digest of (issue_number, crime category, original text)
                # this should ensure a unique hash
                #
                # NOTE(review): update() is given str values here; on
                # Python 3 hashlib requires bytes, so these would need
                # encoding first
                hasher = hashlib.md5()
                hasher.update(str(police_log.issue_number))
                hasher.update(report['category'])
                hasher.update(report['original_text'])

                crime=CrimeReport(hash=hasher.hexdigest(),
                                  policelog=police_log,  # foreign key: specify police log object
                                  category=report['category'],
                                  original_text=report['original_text'],
                                  line_num=report['line_num'],
                                  address=report['address'],
                                  map_scale=report['map_scale'],
                                  date=datetime.date(report['date_year'],
                                                     report['date_month'],
                                                     report['date_day']))

                # add lat-long coordinates to crime report
                (lat,lng)=self.geocoder.geocode(crime.address,crime.map_scale)
                crime.lat=lat
                crime.long=lng

                crime.save()