def upload_collision_data_from_socrata(docs, index, doc_type, new_mapping=False):
    #input: list of collisions documents
    #output: updates collisions index with new documents
    records = []
    for row in docs:
        row = {(' ').join(k.upper().split('_')): row[k] for k in row} #fix the field names from socrata
        #Only store data in ES if it has lat/lon data
        if 'LATITUDE' in row and 'LONGITUDE' in row and row['LATITUDE'] != "" and row['LONGITUDE'] != "":
            row["GEOJSON_C"] = (float(row["LONGITUDE"]), float(row['LATITUDE']))
            row["GEOSHAPE_C"] = {
                "type": "point",
                "coordinates": [float(row["LONGITUDE"]), float(row['LATITUDE'])]}
            row['LOCATION'] = '(%s,%s)' % (row['LOCATION']['latitude'], row['LOCATION']['longitude'])

            #convert date and time into one variable
            coll_date = parse(row['DATE']).replace(tzinfo=None)
            coll_time = parse(row['TIME']).replace(tzinfo=None)
            row["DATETIME_C"] = dt.datetime.strftime(
                coll_date + dt.timedelta(hours=coll_time.hour, seconds=coll_time.minute*60),
                "%Y-%m-%dT%H:%M:%S")

            #assign a unique id based on date, time, and location
            #row["ID"] = hashlib.sha224(dt.datetime.strftime(coll_date,"%m/%d/%Y") + dt.datetime.strftime(coll_time,"%H:%M:%S") + row["LATITUDE"] + row["LONGITUDE"]).hexdigest()

            #append "collision_" in front of each column
            #print row
            newrow = dict()
            for key, value in row.iteritems():
                newrow["collision_" + key] = value

            #add ZCTA zip code fields
            newrow = add_ZCTA.add_zcta_zip_to_collision_rec(newrow)

            #add injured/killed binary variable
            newrow['collision_injured_or_killed'] = 0
            inj_fatalities = (float(newrow['collision_NUMBER OF CYCLIST INJURED']) +
                              float(newrow['collision_NUMBER OF CYCLIST KILLED']) +
                              float(newrow['collision_NUMBER OF PEDESTRIANS INJURED']) +
                              float(newrow['collision_NUMBER OF PEDESTRIANS KILLED']) +
                              float(newrow['collision_NUMBER OF MOTORIST INJURED']) +
                              float(newrow['collision_NUMBER OF MOTORIST KILLED']))
            if inj_fatalities > 0:
                newrow['collision_injured_or_killed'] = 1

            records.append(newrow)

    if new_mapping:
        #if this is a new index, use function that creates the mapping
        upload = {'index': index, 'doc_type': doc_type, 'id_field': 'collision_UNIQUE KEY',
                  'geopoint': 'collision_GEOJSON_C', 'geoshape': 'collision_GEOSHAPE_C'}
        upload_to_Elasticsearch.bulk_upload_docs_to_ES_cURL(records, **upload)
    else:
        #update existing index
        upload = {'index': index, 'doc_type': doc_type, 'id_field': 'collision_UNIQUE KEY'}
        upload_to_Elasticsearch.update_ES_records_curl(records, **upload)
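# Hedged usage sketch for upload_collision_data_from_socrata: the index/doc_type names, the
# Socrata endpoint, and MY_APP_TOKEN below are illustrative assumptions, not values confirmed
# in this repo. It assumes an authenticated sodapy Socrata client.
#
# client = Socrata("data.cityofnewyork.us", MY_APP_TOKEN)   #hypothetical app token variable
# collision_docs = client.get("h9gi-nx95", limit=1000)      #assumed NYC collisions endpoint
# upload_collision_data_from_socrata(collision_docs,
#                                    index='saferoad',       #assumed index name
#                                    doc_type='collisions',  #assumed doc_type
#                                    new_mapping=False)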
def add_collision_geoshape_point(index, doc_type):
    #input: index name, doc_type
    #Searches ES collisions data, updates records with geoshape mapping
    es_url = 'http://%s:%s@%s:9200' % (ES_username, ES_password, ES_url)
    es = Elasticsearch(es_url)

    try:
        mapping = {}
        mapping['properties'] = {}
        #set the geo_shape mapping
        mapping['properties']['collision_GEOSHAPE_C'] = {'type': 'geo_shape', 'tree': 'quadtree', 'precision': '1m'}

        #use cURL to put the mapping
        p = subprocess.Popen(['curl', '%s/%s/_mapping/%s' % (es_url, index, doc_type),
                              '-d', '%s' % json.dumps(mapping)], stderr=subprocess.PIPE)
        out, err = p.communicate()
        if err:
            print '\n' + err
    except Exception as e:
        #do not try to recreate the index
        print "Error creating index:"
        print e

    idx = 0
    updates = []
    for result in helpers.scan(es, index=index, doc_type=doc_type):
        if 'collision_GEOJSON_C' in result['_source']:
            idx += 1
            _id = result['_id']
            #make copy of geopoint field as a geoshape point
            result['_source']['collision_GEOSHAPE_C'] = {
                "type": "point",
                "coordinates": result['_source']['collision_GEOJSON_C']
            }
            updates.append(result['_source'])
            if idx >= 10000:
                upload_to_Elasticsearch.update_ES_records_curl(updates, index=index, doc_type=doc_type, id_field='collision_UNIQUE KEY')
                idx = 0
                updates = []

    #upload the remaining records
    upload_to_Elasticsearch.update_ES_records_curl(updates, index=index, doc_type=doc_type, id_field='collision_UNIQUE KEY')
def upload_collision_data_from_flatfile(docs, index, doc_type, new_mapping=False):
    #input: list of collisions documents
    #output: updates collisions index with new documents
    records = []
    for row in docs:
        #Only store data in ES if it has lat/lon data
        if row['LATITUDE'] != "" and row['LONGITUDE'] != "":
            row["GEOJSON_C"] = (float(row["LONGITUDE"]), float(row['LATITUDE']))
            row["GEOSHAPE_C"] = {
                "type": "point",
                "coordinates": [float(row["LONGITUDE"]), float(row['LATITUDE'])]
            }

            #convert date and time into one variable
            coll_date = parse(row['DATE']).replace(tzinfo=None)
            coll_time = parse(row['TIME']).replace(tzinfo=None)
            row["DATETIME_C"] = dt.datetime.strftime(
                coll_date + dt.timedelta(hours=coll_time.hour, seconds=coll_time.minute*60),
                "%Y-%m-%dT%H:%M:%S")

            #append "collision_" in front of each column
            #print row
            newrow = dict()
            for key, value in row.iteritems():
                newrow["collision_" + key] = value

            #add ZCTA zip code fields
            #newrow = add_ZCTA.add_zcta_zip_to_collision_rec(newrow)

            #add injured/killed binary variable
            newrow['collision_injured_or_killed'] = 0
            inj_fatalities = (float(newrow['collision_NUMBER OF CYCLIST INJURED']) +
                              float(newrow['collision_NUMBER OF CYCLIST KILLED']) +
                              float(newrow['collision_NUMBER OF PEDESTRIANS INJURED']) +
                              float(newrow['collision_NUMBER OF PEDESTRIANS KILLED']) +
                              float(newrow['collision_NUMBER OF MOTORIST INJURED']) +
                              float(newrow['collision_NUMBER OF MOTORIST KILLED']))
            if inj_fatalities > 0:
                newrow['collision_injured_or_killed'] = 1

            records.append(newrow)

    if new_mapping:
        #if this is a new index, use function that creates the mapping
        upload = {'index': index, 'doc_type': doc_type, 'id_field': 'collision_UNIQUE KEY',
                  'geopoint': 'collision_GEOJSON_C', 'geoshape': 'collision_GEOSHAPE_C'}
        upload_to_Elasticsearch.bulk_upload_docs_to_ES_cURL(records, **upload)
    else:
        #update existing index
        upload = {'index': index, 'doc_type': doc_type, 'id_field': 'collision_UNIQUE KEY'}
        upload_to_Elasticsearch.update_ES_records_curl(records, **upload)
def upload_open_data_to_Elasticsearch(url, endpoint, api_key, query=None, kwargs={}):
    #input: Socrata url, endpoint, API key, OPTIONAL query, and ES bulk upload kwargs
    #output: uploads data to ES index
    client = Socrata(url, api_key)

    idx = 0
    time.sleep(5) #sleep 5 seconds, to allow time to connect
    docs = client.get(endpoint, limit=10000, offset=0, where=query)
    upload_to_Elasticsearch.bulk_upload_docs_to_ES_cURL(docs, **kwargs)
    #time.sleep(20) #sleep 20 seconds, to allow time to connect

    while len(docs) > 0:
        #page through the results, uploading each batch
        idx += 10000
        docs = client.get(endpoint, limit=10000, offset=idx, where=query)
        upload_to_Elasticsearch.update_ES_records_curl(docs, **kwargs)

    client.close()
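# Hedged usage sketch for upload_open_data_to_Elasticsearch: the url, endpoint id, index,
# doc_type, and MY_APP_TOKEN are illustrative assumptions; `kwargs` is forwarded unchanged
# to the bulk-upload helper.
#
# es_kwargs = {'index': 'nyc_open_data',   #assumed index name
#              'doc_type': 'permits',      #assumed doc_type
#              'id_field': 'id'}           #assumed unique id field
# upload_open_data_to_Elasticsearch('data.cityofnewyork.us', 'abcd-1234', MY_APP_TOKEN,
#                                   query="issuance_date > '2016-01-01'", kwargs=es_kwargs)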
def update_zip_code_level_data(index, doc_type, p=Proj(init='epsg:2263')):
    #updates the zip code collection with zip-specific data
    es_url = 'http://%s:%s@%s:9200' % (ES_username, ES_password, ES_url)
    es = Elasticsearch(es_url)

    to_update = []
    for doc in helpers.scan(es, index=index, doc_type=doc_type):
        #update each zip code with its polygon area, street data, and traffic data
        zipcode = doc['_source']
        zipcode['area'] = Polygon([p(lng, lat) for lng, lat in zipcode['coords']['coordinates'][0]]).area / (1609.34**2)
        zipcode.update(zip_street_data(zipcode['zipcode'].split('-')[0], p))
        zipcode.update(zip_traffic_data(zipcode['zipcode']))
        to_update.append(zipcode)

    kwargs = {
        "index": "nyc_zip_codes",
        "doc_type": "zip_codes",
        "id_field": "zipcode"
    }
    upload_to_Elasticsearch.update_ES_records_curl(to_update, **kwargs)
# ### Write predictions to Elasticsearch
#
# Write the RF predictions back to the SafeRoad results Elasticsearch index.

predictions.saveAsNewAPIHadoopFile(
    path='-',
    outputFormatClass="org.elasticsearch.hadoop.mr.EsOutputFormat",
    keyClass="org.apache.hadoop.io.NullWritable",
    valueClass="org.elasticsearch.hadoop.mr.LinkedMapWritable",
    conf=pred_write_conf)

#convert ratings from numpy float to python native float, convert to tuple for loading into ES
featuresRDD = featuresRDD.map(lambda f: (f['id'], clean_feat_rank(f)))
featuresRDD.saveAsNewAPIHadoopFile(
    path='-',
    outputFormatClass="org.elasticsearch.hadoop.mr.EsOutputFormat",
    keyClass="org.apache.hadoop.io.NullWritable",
    valueClass="org.elasticsearch.hadoop.mr.LinkedMapWritable",
    conf=feat_write_conf)

# ### Write Model diagnostics to Elasticsearch

index, doc_type = diag.split('/')
diagnostics.update({'id': '1'})
upload_to_Elasticsearch.update_ES_records_curl([diagnostics], index=index, doc_type=doc_type, id_field="id")
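# Hedged sketch of the write configuration used above: `pred_write_conf` is not defined in this
# snippet, so the keys below are standard elasticsearch-hadoop settings with illustrative values
# (the index/doc_type name and id field are assumptions, not confirmed by this repo).
#
# pred_write_conf = {
#     "es.nodes": ES_url,                              #Elasticsearch host
#     "es.port": "9200",
#     "es.resource": "saferoad_results/predictions",   #assumed index/doc_type
#     "es.mapping.id": "id",                           #field used as the document _id
#     "es.write.operation": "upsert"                   #update existing docs, insert new ones
# }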
def add_zcta_zip_to_index(index, doc_type, loc_field, id_field, prefix=None):
    #iterates through ZCTA zip code polygons and updates index records with the zip code id they fall within
    #if a record has no lat/lng, adds 'NA' to the zip fields
    #input: index name, doc_type, location field, id field, and optional field-name prefix
    es_url = 'http://%s:%s@%s:9200' % (ES_username, ES_password, ES_url)
    es = Elasticsearch(es_url)

    proj = Proj(init='epsg:2263') #NY/Long Island UTM projection

    if prefix:
        zip_field1 = prefix + '_ZCTA_ZIP'
        zip_field2 = prefix + '_ZCTA_ZIP_NoSuffix'
    else:
        zip_field1 = 'ZCTA_ZIP'
        zip_field2 = 'ZCTA_ZIP_NoSuffix'

    try:
        mapping = {}
        mapping['properties'] = {}
        #set the ZCTA zip field mapping
        mapping['properties'][zip_field1] = {'type': 'string'}
        mapping['properties'][zip_field2] = {'type': 'string'}

        #use cURL to put the mapping
        p = subprocess.Popen(['curl', '%s/%s/_mapping/%s' % (es_url, index, doc_type),
                              '-d', '%s' % json.dumps(mapping)], stderr=subprocess.PIPE)
        out, err = p.communicate()
        if err:
            print '\n' + err
    except Exception as e:
        #do not try to recreate the index
        print "Error creating index:"
        print e

    idx = 0
    updates = []
    for result in helpers.scan(es, index=index, doc_type=doc_type):
        idx += 1
        _id = result['_id']

        #Add placeholder for ZCTA zip code
        result['_source'][zip_field1] = 'NA'
        result['_source'][zip_field2] = 'NA'

        if loc_field in result['_source']:
            query = '''{
                "query":{
                    "bool":{
                        "must":{"match_all": {}},
                        "filter":{
                            "geo_shape":{
                                "coords":{
                                    "indexed_shape": {
                                        "index": "%s",
                                        "type": "%s",
                                        "id": "%s",
                                        "path": "%s"
                                    },
                                    "relation": "intersects"
                                }
                            }
                        }
                    }
                }
            }''' % (index, doc_type, _id, loc_field)

            max_area = 0
            max_zip = False
            #query the zip codes, finding all zip shapes that contain the current collision
            for shape in helpers.scan(es, query=query, index='nyc_zip_codes', doc_type='zip_codes'):
                coords = [proj(lng, lat) for lng, lat in shape['_source']['coords']['coordinates'][0]]
                poly = Polygon(coords)
                if poly.area > max_area:
                    #get the largest zip code by geographic area
                    max_area = poly.area
                    max_zip = shape['_id']

            if max_zip:
                result['_source'][zip_field1] = max_zip
                result['_source'][zip_field2] = max_zip.split('-')[0]

        updates.append(result['_source'])
        if idx >= 10000:
            upload_to_Elasticsearch.update_ES_records_curl(updates, index=index, doc_type=doc_type, id_field=id_field)
            idx = 0
            updates = []

    #upload the remaining records
    upload_to_Elasticsearch.update_ES_records_curl(updates, index=index, doc_type=doc_type, id_field=id_field)
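# Hedged usage sketch for add_zcta_zip_to_index: the index and doc_type names are illustrative
# assumptions about how collision records are stored; the field names match those created by
# the upload functions above.
#
# add_zcta_zip_to_index(index='saferoad',                    #assumed index name
#                       doc_type='collisions',               #assumed doc_type
#                       loc_field='collision_GEOSHAPE_C',    #geo_shape field set during upload
#                       id_field='collision_UNIQUE KEY',
#                       prefix='collision')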
def find_closest(index1, doc_type1, geo_field1, id_field1, index2, doc_type2, geo_field2, proj=None):
    #input: 2 indexes, doc_types, and geospatial fields
    #output: updates index 1 with record of closest item from index2
    ##O(N^2) RUNTIME. ONLY RUN ON SMALL INDEXES!!
    es_url = 'http://%s:%s@%s:9200' % (ES_username, ES_password, ES_url)
    es = Elasticsearch(es_url)

    #look up the geo field types (geo_point vs. geo_shape) for both indexes
    mapping1 = es.indices.get_field_mapping(index=index1, doc_type=doc_type1, fields=geo_field1)
    type1 = mapping1[index1]['mappings'][doc_type1][geo_field1]['mapping'][geo_field1]['type']
    #print type1
    mapping2 = es.indices.get_field_mapping(index=index2, doc_type=doc_type2, fields=geo_field2)
    type2 = mapping2[index2]['mappings'][doc_type2][geo_field2]['mapping'][geo_field2]['type']
    #print type2

    updates = []
    idx = 0
    for res1 in helpers.scan(es, index=index1, doc_type=doc_type1):
        _id = res1['_id']

        #build a shapely geometry for the first record (point or polygon, projected if requested)
        if type1 == 'geo_point':
            if proj:
                poly1 = Point(proj(res1['_source'][geo_field1][0], res1['_source'][geo_field1][1]))
            else:
                poly1 = Point(res1['_source'][geo_field1])
        else:
            if res1['_source'][geo_field1]['type'].lower() == 'point':
                if proj:
                    poly1 = Point(proj(res1['_source'][geo_field1]['coordinates'][0],
                                       res1['_source'][geo_field1]['coordinates'][1]))
                else:
                    poly1 = Point(res1['_source'][geo_field1]['coordinates'])
            else:
                if proj:
                    coords = [proj(lng, lat) for lng, lat in res1['_source'][geo_field1]['coordinates'][0]]
                    poly1 = Polygon(coords)
                else:
                    poly1 = Polygon(res1['_source'][geo_field1]['coordinates'][0])

        idx += 1
        min_dist = float('inf')
        closest = None
        #NOTE: the [:5] slice only compares against the first 5 records of index2 (likely a debug limit);
        #remove it to search the full index
        for res2 in list(helpers.scan(es, index=index2, doc_type=doc_type2))[:5]:
            #build a shapely geometry for the candidate record from index2
            if type2 == 'geo_point':
                if proj:
                    poly2 = Point(proj(res2['_source'][geo_field2][0], res2['_source'][geo_field2][1]))
                else:
                    poly2 = Point(res2['_source'][geo_field2])
            else:
                if res2['_source'][geo_field2]['type'].lower() == 'point':
                    if proj:
                        poly2 = Point(proj(res2['_source'][geo_field2]['coordinates'][0],
                                           res2['_source'][geo_field2]['coordinates'][1]))
                    else:
                        poly2 = Point(res2['_source'][geo_field2]['coordinates'])
                else:
                    if proj:
                        coords = [proj(lng, lat) for lng, lat in res2['_source'][geo_field2]['coordinates'][0]]
                        poly2 = Polygon(coords)
                    else:
                        poly2 = Polygon(res2['_source'][geo_field2]['coordinates'][0])

            #compare centroid-to-centroid euclidean distance
            c1, c2 = poly1.centroid.coords[0], poly2.centroid.coords[0]
            dist = math.sqrt((c1[0] - c2[0])**2 + (c1[1] - c2[1])**2)
            if dist < min_dist:
                min_dist = dist
                closest = res2['_id']

        new_doc = deepcopy(res1['_source'])
        new_doc['closest_%s' % doc_type2] = closest
        updates.append(new_doc)
        if idx >= 10000:
            upload_to_Elasticsearch.update_ES_records_curl(updates, index=index1, doc_type=doc_type1, id_field=id_field1)
            idx = 0
            updates = []

    #upload remaining records
    upload_to_Elasticsearch.update_ES_records_curl(updates, index=index1, doc_type=doc_type1, id_field=id_field1)
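# Hedged usage sketch for find_closest: the index, doc_type, and field names are illustrative
# assumptions. Passing a pyproj projection makes the centroid distances planar rather than
# raw lat/lng degrees.
#
# from pyproj import Proj
# find_closest(index1='saferoad', doc_type1='collisions',
#              geo_field1='collision_GEOSHAPE_C', id_field1='collision_UNIQUE KEY',
#              index2='nyc_zip_codes', doc_type2='zip_codes', geo_field2='coords',
#              proj=Proj(init='epsg:2263'))   #NY/Long Island projection, as used above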