Example #1
def upload_collision_data_from_socrata(docs,index,doc_type,new_mapping=False):
    #input: list of collisions documents
    #output: updates collisions index with new documents
    records = []
    for row in docs:
        row = {(' ').join(k.upper().split('_')):row[k] for k in row} #fix the field names from socrata
        #Only store data in ES if it has lat/lon data
        if 'LATITUDE' in row and 'LONGITUDE' in row and row['LATITUDE'] != "" and row['LONGITUDE'] != "":
            row["GEOJSON_C"] = (float(row["LONGITUDE"]),float(row['LATITUDE']))
            row["GEOSHAPE_C"] = { "type": "point", "coordinates": [float(row["LONGITUDE"]),float(row['LATITUDE'])]}
            row['LOCATION'] = '(%s,%s)' % (row['LOCATION']['latitude'],row['LOCATION']['longitude'])
            #convert date and time into one variable
            coll_date = parse(row['DATE']).replace(tzinfo=None)
            coll_time = parse(row['TIME']).replace(tzinfo=None)
            row["DATETIME_C"] = dt.datetime.strftime(coll_date + dt.timedelta(hours=coll_time.hour,seconds=coll_time.minute*60), "%Y-%m-%dT%H:%M:%S")
            #assign a unique id based on date, time, and location
            #row["ID"] = hashlib.sha224(dt.datetime.strftime(coll_date,"%m/%d/%Y") + dt.datetime.strftime(coll_time,"%H:%M:%S") + row["LATITUDE"] + row["LONGITUDE"]).hexdigest()

            # append "collision_" in front of each column
            #print row
            newrow = dict()
            for key, value in row.iteritems():
                newrow["collision_" + key] = value

            #add ZCTA zip code fields
            newrow = add_ZCTA.add_zcta_zip_to_collision_rec(newrow)

            #add injured/killed binary variable
            newrow['collision_injured_or_killed']=0
            inj_fatalities = (float(newrow['collision_NUMBER OF CYCLIST INJURED'])+
                             float(newrow['collision_NUMBER OF CYCLIST KILLED'])+
                             float(newrow['collision_NUMBER OF PEDESTRIANS INJURED'])+
                             float(newrow['collision_NUMBER OF PEDESTRIANS KILLED'])+
                             float(newrow['collision_NUMBER OF MOTORIST INJURED'])+
                             float(newrow['collision_NUMBER OF MOTORIST KILLED']))
            if inj_fatalities > 0:
                newrow['collision_injured_or_killed'] = 1
            
            records.append(newrow)	

    if new_mapping:
        #if this is a new index, use function that creates the mapping
        upload = {'index':index,'doc_type':doc_type,'id_field':'collision_UNIQUE KEY','geopoint':'collision_GEOJSON_C','geoshape':'collision_GEOSHAPE_C'}
        upload_to_Elasticsearch.bulk_upload_docs_to_ES_cURL(records,**upload)
    else:
        #update existing index
        upload = {'index':index,'doc_type':doc_type,'id_field':'collision_UNIQUE KEY'}
        upload_to_Elasticsearch.update_ES_records_curl(records,**upload) 
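#A hedged usage sketch: documents pulled from a Socrata endpoint (as with the Socrata
#client used in a later example) can be passed straight to this function. The host,
#dataset id, app token, and index names below are placeholders, not the project's values.
#   client = Socrata('data.cityofnewyork.us', 'MY_APP_TOKEN')
#   docs = client.get('xxxx-xxxx', limit=10000)
#   upload_collision_data_from_socrata(docs, index='collisions_idx', doc_type='collision', new_mapping=True)
#   client.close()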
def add_collision_geoshape_point(index, doc_type):
    #input: index name, doc_type
    #Searches ES collisions data, updates records with geo_shape mapping
    es_url = 'http://%s:%s@%s:9200' % (ES_username,ES_password,ES_url)
    es = Elasticsearch(es_url)


    try:
        mapping = {}
        mapping['properties'] = {}

        #set the geo_shape mapping
        mapping['properties']['collision_GEOSHAPE_C'] = {'type':'geo_shape','tree':'quadtree', 'precision': '1m'}
        
        #use cURL to put the mapping
        p = subprocess.Popen(['curl','%s/%s/_mapping/%s' % (es_url,index,doc_type),'-d','%s' % json.dumps(mapping)],stderr=subprocess.PIPE)
        out, err = p.communicate()
        if err: print '\n' + err
        
    except Exception as e:
        #do not try to recreate the index
        print "Error creating index:"
        print e
        
    
    idx=0
    updates=[]
    for result in helpers.scan(es,index=index,doc_type=doc_type): 
        if 'collision_GEOJSON_C' in result['_source']:
            idx+=1
            _id = result['_id']
            #make copy of geopoint field as a geoshape point
            result['_source']['collision_GEOSHAPE_C'] = {
                                                            "type":"point",
                                                             "coordinates": result['_source']['collision_GEOJSON_C'] 
                                                        }

            updates.append(result['_source'])

        if idx >= 10000:
            upload_to_Elasticsearch.update_ES_records_curl(updates,index=index,doc_type=doc_type,id_field='collision_UNIQUE KEY')
            idx=0
            updates=[]
        
    #upload the remaining records
    upload_to_Elasticsearch.update_ES_records_curl(updates,index=index,doc_type=doc_type,id_field='collision_UNIQUE KEY')
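#A hedged usage sketch (the index and doc_type names are placeholders, not the project's
#actual values):
#   add_collision_geoshape_point('collisions_idx', 'collision')
#Design note: the mapping above is applied by shelling out to curl; with the
#elasticsearch-py client already created in this function, the same mapping could likely
#be applied directly (assuming a client/ES version that still accepts doc_type):
#   es.indices.put_mapping(index=index, doc_type=doc_type, body=mapping)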
Example #3
def upload_collision_data_from_flatfile(docs,index,doc_type,new_mapping=False):
    #input: list of collisions documents
    #output: updates collisions index with new documents
    records = []
    for row in docs:
        #Only store data in ES if it has lat/lon data
        if row['LATITUDE'] != "" and row['LONGITUDE'] != "":
            row["GEOJSON_C"] = (float(row["LONGITUDE"]),float(row['LATITUDE']))
            row["GEOSHAPE_C"] = { "type": "point", "coordinates": [float(row["LONGITUDE"]),float(row['LATITUDE'])] }
            #convert date and time into one variable
            coll_date = parse(row['DATE']).replace(tzinfo=None)
            coll_time = parse(row['TIME']).replace(tzinfo=None)
            row["DATETIME_C"] = dt.datetime.strftime(coll_date + dt.timedelta(hours=coll_time.hour,seconds=coll_time.minute*60), "%Y-%m-%dT%H:%M:%S")

            # append "collision_" in front of each column
            #print row
            newrow = dict()
            for key, value in row.iteritems():
                    newrow["collision_" + key] = value

            #add ZCTA zip code fields
            #newrow = add_ZCTA.add_zcta_zip_to_collision_rec(newrow)

            #add injured/killed binary variable
            newrow['collision_injured_or_killed']=0
            inj_fatalities = (float(newrow['collision_NUMBER OF CYCLIST INJURED'])+
                             float(newrow['collision_NUMBER OF CYCLIST KILLED'])+
                             float(newrow['collision_NUMBER OF PEDESTRIANS INJURED'])+
                             float(newrow['collision_NUMBER OF PEDESTRIANS KILLED'])+
                             float(newrow['collision_NUMBER OF MOTORIST INJURED'])+
                             float(newrow['collision_NUMBER OF MOTORIST KILLED']))
            if inj_fatalities > 0:
                newrow['collision_injured_or_killed'] = 1
            
            records.append(newrow)	

    if new_mapping:
        #if this is a new index, use function that creates the mapping
        upload = {'index':index,'doc_type':doc_type,'id_field':'collision_UNIQUE KEY','geopoint':'collision_GEOJSON_C','geoshape':'collision_GEOSHAPE_C'}
        upload_to_Elasticsearch.bulk_upload_docs_to_ES_cURL(records,**upload)
    else:
        #update existing index
        upload = {'index':index,'doc_type':doc_type,'id_field':'collision_UNIQUE KEY'}
        upload_to_Elasticsearch.update_ES_records_curl(records,**upload) 
Example #4
def upload_open_data_to_Elasticsearch(url,
                                      endpoint,
                                      api_key,
                                      query=None,
                                      kwargs={}):
    #input: Socrata url, endpoint, API key, OPTIONAL query, and ES bulk upload kwargs
    #output: uploads data to ES index
    client = Socrata(url, api_key)
    idx = 0
    time.sleep(5)  #sleep 5 seconds, to allow time to connect
    docs = client.get(endpoint, limit=10000, offset=0, where=query)
    upload_to_Elasticsearch.bulk_upload_docs_to_ES_cURL(docs, **kwargs)
    #time.sleep(20)#sleep 20 seconds, to allow time to connect
    while len(docs) > 0:
        #page through the remaining results, uploading each batch
        idx += 10000
        docs = client.get(endpoint, limit=10000, offset=idx, where=query)
        upload_to_Elasticsearch.update_ES_records_curl(docs, **kwargs)
    client.close()
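#A hedged usage sketch; every value below is a placeholder rather than the project's
#configuration. The kwargs dict mirrors the upload dicts built in the collision upload
#functions above and is passed through to the bulk upload helpers.
#   es_kwargs = {'index': 'collisions_idx', 'doc_type': 'collision', 'id_field': 'unique_key'}
#   upload_open_data_to_Elasticsearch('data.cityofnewyork.us', 'xxxx-xxxx', MY_API_KEY,
#                                     query="latitude IS NOT NULL", kwargs=es_kwargs)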
Example #5
def update_zip_code_level_data(index, doc_type, p=Proj(init='epsg:2263')):
    #updates the zip code collection with zip-specific data
    es_url = 'http://%s:%s@%s:9200' % (ES_username, ES_password, ES_url)
    es = Elasticsearch(es_url)
    to_update = []
    for doc in helpers.scan(
            es, index=index, doc_type=doc_type
    ):  #iterate through all the zip code documents
        zipcode = doc['_source']
        zipcode['area'] = Polygon([
            p(lng, lat) for lng, lat in zipcode['coords']['coordinates'][0]
        ]).area / (1609.34**2)
        zipcode.update(zip_street_data(zipcode['zipcode'].split('-')[0], p))
        zipcode.update(zip_traffic_data(zipcode['zipcode']))
        to_update.append(zipcode)
    kwargs = {
        "index": "nyc_zip_codes",
        "doc_type": "zip_codes",
        "id_field": "zipcode"
    }
    upload_to_Elasticsearch.update_ES_records_curl(to_update, **kwargs)
# ### Write predictions to Elasticsearch
#
# Write the RF predictions back to the SafeRoad results Elasticsearch index.

predictions.saveAsNewAPIHadoopFile(
    path='-',
    outputFormatClass="org.elasticsearch.hadoop.mr.EsOutputFormat",
    keyClass="org.apache.hadoop.io.NullWritable",
    valueClass="org.elasticsearch.hadoop.mr.LinkedMapWritable",
    conf=pred_write_conf)
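#pred_write_conf is defined elsewhere in this script. For orientation only, an
#elasticsearch-hadoop write conf is a plain dict of string settings along these lines;
#the resource name below is a hypothetical placeholder, not the project's actual index:
#   pred_write_conf = {
#       "es.nodes": ES_url,
#       "es.port": "9200",
#       "es.resource": "saferoad_results/predictions",   #index/doc_type to write to
#       "es.mapping.id": "id",                           #field used as the document _id
#       "es.net.http.auth.user": ES_username,
#       "es.net.http.auth.pass": ES_password
#   }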

#convert feature rankings from numpy floats to native Python floats, and convert each record to an (id, doc) tuple for loading into ES
featuresRDD = featuresRDD.map(lambda f: (f['id'], clean_feat_rank(f)))

featuresRDD.saveAsNewAPIHadoopFile(
    path='-',
    outputFormatClass="org.elasticsearch.hadoop.mr.EsOutputFormat",
    keyClass="org.apache.hadoop.io.NullWritable",
    valueClass="org.elasticsearch.hadoop.mr.LinkedMapWritable",
    conf=feat_write_conf)

#
# ### Write Model diagnostics to Elasticsearch
#
index, doc_type = diag.split('/')
diagnostics.update({'id': '1'})
upload_to_Elasticsearch.update_ES_records_curl([diagnostics],
                                               index=index,
                                               doc_type=doc_type,
                                               id_field="id")
Example #7
def add_zcta_zip_to_index(index, doc_type, loc_field, id_field, prefix=None):
    #iterates through ZCTA zip code polygons and updates index records with the zip code they are contained within
    #if a record has no lat/lng, 'NA' is written to the zip fields
    #input: index name, doc_type, location field, id field, optional field-name prefix
    #output: updates records in the index with ZCTA zip code fields
    es_url = 'http://%s:%s@%s:9200' % (ES_username, ES_password, ES_url)
    es = Elasticsearch(es_url)

    proj = Proj(init='epsg:2263')  #NY/Long Island State Plane projection (EPSG:2263)

    if prefix:
        zip_field1 = prefix + '_ZCTA_ZIP'
        zip_field2 = prefix + '_ZCTA_ZIP_NoSuffix'
    else:
        zip_field1 = 'ZCTA_ZIP'
        zip_field2 = 'ZCTA_ZIP_NoSuffix'

    try:
        mapping = {}
        mapping['properties'] = {}

        #set the ZCTA zip field mapping
        mapping['properties'][zip_field1] = {'type': 'string'}
        mapping['properties'][zip_field2] = {'type': 'string'}

        #use cURL to put the mapping
        p = subprocess.Popen([
            'curl',
            '%s/%s/_mapping/%s' % (es_url, index, doc_type), '-d',
            '%s' % json.dumps(mapping)
        ],
                             stderr=subprocess.PIPE)
        out, err = p.communicate()
        if err: print '\n' + err

    except Exception as e:
        #do not try to recreate the index
        print "Error creating index:"
        print e

    idx = 0
    updates = []
    for result in helpers.scan(es, index=index, doc_type=doc_type):
        idx += 1
        _id = result['_id']
        #Add placeholder for ZCTA zip code
        result['_source'][zip_field1] = 'NA'
        result['_source'][zip_field2] = 'NA'

        if loc_field in result['_source']:
            query = '''{
                        "query":{
                                "bool":{
                                        "must":{"match_all": {}},
                                        "filter":{
                                                "geo_shape":{
                                                        "coords":{
                                                                "indexed_shape": {
                                                                        "index": "%s",
                                                                        "type": "%s",
                                                                        "id": "%s",
                                                                        "path": "%s"
                                                                        },
                                                                "relation": "intersects"
                                                                }
                                                        }
                                                }
                                        }
                                }
                        }''' % (index, doc_type, _id, loc_field)
            max_area = 0
            max_zip = False
            #query the zip codes, finding all zip shapes that intersect the current record
            for shape in helpers.scan(es,
                                      query=query,
                                      index='nyc_zip_codes',
                                      doc_type='zip_codes'):
                coords = [
                    proj(lng, lat) for lng, lat in shape['_source']['coords']
                    ['coordinates'][0]
                ]
                poly = Polygon(coords)
                if poly.area > max_area:
                    #get the largest zip code by geographic area
                    max_area = poly.area
                    max_zip = shape['_id']
            if max_zip:
                result['_source'][zip_field1] = max_zip
                result['_source'][zip_field2] = max_zip.split('-')[0]
        updates.append(result['_source'])

        if idx >= 10000:
            upload_to_Elasticsearch.update_ES_records_curl(updates,
                                                           index=index,
                                                           doc_type=doc_type,
                                                           id_field=id_field)
            idx = 0
            updates = []

    #upload the remaining records
    upload_to_Elasticsearch.update_ES_records_curl(updates,
                                                   index=index,
                                                   doc_type=doc_type,
                                                   id_field=id_field)
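#A hedged usage sketch (index and doc_type are placeholders; the field names are the ones
#used by the collision upload functions above). With prefix='collision', the function adds
#'collision_ZCTA_ZIP' and 'collision_ZCTA_ZIP_NoSuffix' to every document:
#   add_zcta_zip_to_index('collisions_idx', 'collision',
#                         loc_field='collision_GEOSHAPE_C',
#                         id_field='collision_UNIQUE KEY',
#                         prefix='collision')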
Example #8
def find_closest(index1,
                 doc_type1,
                 geo_field1,
                 id_field1,
                 index2,
                 doc_type2,
                 geo_field2,
                 proj=None):
    #input: 2 indexes, doc_types, and geospatial fields
    #output: updates index 1 with record of closest item from index2

    ##O(N^2) RUNTIME. ONLY RUN ON SMALL INDEXES!!

    es_url = 'http://%s:%s@%s:9200' % (ES_username, ES_password, ES_url)
    es = Elasticsearch(es_url)

    mapping1 = es.indices.get_field_mapping(index=index1,
                                            doc_type=doc_type1,
                                            fields=geo_field1)
    type1 = mapping1[index1]['mappings'][doc_type1][geo_field1]['mapping'][
        geo_field1]['type']
    #print type1

    mapping2 = es.indices.get_field_mapping(index=index2,
                                            doc_type=doc_type2,
                                            fields=geo_field2)
    type2 = mapping2[index2]['mappings'][doc_type2][geo_field2]['mapping'][
        geo_field2]['type']
    #print type2

    updates = []
    idx = 0
    for res1 in helpers.scan(es, index=index1, doc_type=doc_type1):
        _id = res1['_id']
        if type1 == 'geo_point':
            if proj:
                poly1 = Point(
                    proj(res1['_source'][geo_field1][0],
                         res1['_source'][geo_field1][1]))
            else:
                poly1 = Point(res1['_source'][geo_field1])
        else:
            if res1['_source'][geo_field1]['type'].lower() == 'point':
                if proj:
                    poly1 = Point(
                        proj(res1['_source'][geo_field1]['coordinates'][0],
                             res1['_source'][geo_field1]['coordinates'][1]))
                else:
                    poly1 = Point(res1['_source'][geo_field1]['coordinates'])
            else:
                if proj:
                    coords = [
                        proj(lng, lat) for lng, lat in res1['_source']
                        [geo_field1]['coordinates'][0]
                    ]
                    poly1 = Polygon(coords)
                else:
                    poly1 = Polygon(
                        res1['_source'][geo_field1]['coordinates'][0])

        idx += 1
        min_dist = float('inf')
        closest = None
        #NOTE: only the first 5 documents from index2 are compared here (this looks like a debugging limit)
        for res2 in list(helpers.scan(es, index=index2,
                                      doc_type=doc_type2))[:5]:
            if type2 == 'geo_point':
                if proj:
                    poly2 = Point(
                        proj(res2['_source'][geo_field2][0],
                             res2['_source'][geo_field2][1]))
                else:
                    poly2 = Point(res2['_source'][geo_field2])
            else:
                if res2['_source'][geo_field2]['type'].lower() == 'point':
                    if proj:
                        poly2 = Point(
                            proj(
                                res2['_source'][geo_field2]['coordinates'][0],
                                res2['_source'][geo_field2]['coordinates'][1]))
                    else:
                        poly2 = Point(
                            res2['_source'][geo_field2]['coordinates'])
                else:
                    if proj:
                        coords = [
                            proj(lng, lat) for lng, lat in res2['_source']
                            [geo_field2]['coordinates'][0]
                        ]
                        poly2 = Polygon(coords)
                    else:
                        poly2 = Polygon(
                            res2['_source'][geo_field2]['coordinates'][0])

            c1, c2 = poly1.centroid.coords[0], poly2.centroid.coords[0]
            dist = math.sqrt((c1[0] - c2[0])**2 + (c1[1] - c2[1])**2)
            if dist < min_dist:
                min_dist = dist
                closest = res2['_id']

        new_doc = deepcopy(res1['_source'])
        new_doc['closest_%s' % doc_type2] = closest
        updates.append(new_doc)
        if idx >= 10000:
            upload_to_Elasticsearch.update_ES_records_curl(updates,
                                                           index=index1,
                                                           doc_type=doc_type1,
                                                           id_field=id_field1)
            idx = 0
            updates = []
    #upload remaining records
    upload_to_Elasticsearch.update_ES_records_curl(updates,
                                                   index=index1,
                                                   doc_type=doc_type1,
                                                   id_field=id_field1)
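#A hedged usage sketch (index names are placeholders; the field names are taken from the
#other examples here). Passing a pyproj projection makes the centroid distances planar in
#projected units instead of raw degrees:
#   p = Proj(init='epsg:2263')   #NY/Long Island State Plane, as used elsewhere in this project
#   find_closest('collisions_idx', 'collision', 'collision_GEOSHAPE_C', 'collision_UNIQUE KEY',
#                'nyc_zip_codes', 'zip_codes', 'coords', proj=p)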