import json

# Delimiter used to join Instagram captions/image URLs into a single string.
POST_DELIMITER = '_!@#*()_'


def write_json_sf_top5(pipeline):
    obj_name = 'static_10_sf_city_larger'
    sf_data = _load_obj(obj_name)
    # query_retsly_v1 returns 11 listings in total
    coordinates, address, retsly_ids, house_img_urls, descriptions = pipeline.query_retsly_v1()

    # Select the most active listings: keeping only those with >= 196
    # Instagram posts yields the top 5.
    post_nums = []
    instagram_top_5_house = dict()
    for k, v in enumerate(sf_data):
        print(k, v)  # v is the Retsly id for one listing
        idx = retsly_ids.index(v)
        print(retsly_ids[idx], address[idx], house_img_urls[idx], descriptions[idx])
        #urls, text = pipeline.post_retrieval(sf_data[v], query)
        instagram_data = sf_data[v]
        post_nums.append(len(instagram_data))
        if len(instagram_data) >= 196:
            instagram_top_5_house[v] = instagram_data

    sf_test_top_5_house = []
    for k, v in enumerate(instagram_top_5_house):
        print(k, v)
        idx = retsly_ids.index(v)
        inst_data = instagram_top_5_house[v]
        # Concatenate captions and image URLs into delimiter-joined strings,
        # skipping posts with a missing caption or image.
        inst_img_urls = ""
        inst_captions = ""
        for k2, v2 in enumerate(inst_data):
            inst_one = inst_data[v2]
            if inst_one['caption'] is None or inst_one['img'] is None:
                continue
            inst_captions += inst_one['caption']['text'] + POST_DELIMITER
            inst_img_urls += inst_one['img'] + POST_DELIMITER
        sf_test_top_5_house.append({
            'name': retsly_ids[idx],
            'latitude': float(coordinates[idx][1]),
            'longitude': float(coordinates[idx][0]),
            'url': house_img_urls[idx],
            'description': descriptions[idx],
            'address': address[idx],
            'inst_captions': inst_captions,
            'inst_img_urls': inst_img_urls,
        })

    # Store the listing info as JSON.
    with open('sf_test_top_5_house_v2.json', 'w') as fp:
        json.dump(sf_test_top_5_house, fp)
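
# Consumer-side sketch: the JSON written above stores captions and image URLs
# as single strings joined by POST_DELIMITER, each with a trailing delimiter,
# so the last element of a split is always empty. The helper name and default
# path are illustrative, not part of the original module.
def load_top5_houses(path='sf_test_top_5_house_v2.json'):
    with open(path) as fp:
        houses = json.load(fp)
    for house in houses:
        house['inst_captions'] = house['inst_captions'].split(POST_DELIMITER)[:-1]
        house['inst_img_urls'] = house['inst_img_urls'].split(POST_DELIMITER)[:-1]
    return houses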
def single_address_full_pipeline(pipeline, query):
    # Step 1: query Retsly to get listing geolocations.
    coordinates, address, retsly_ids, house_image_urls, descriptions = pipeline.query_retsly_v1()
    sf_data = _load_obj('static_10_sf_city_larger')
    max_listing = 3
    ids = 0
    urls, text = [], []  # ensure the return values are defined even if sf_data is empty
    for k, v in enumerate(sf_data):
        print(k, v)  # v is the Retsly id for one listing, e.g. 0358d2ba3e8f74256e993f5b398cb2b1
        idx = retsly_ids.index(v)
        print(retsly_ids[idx], address[idx])
        # Step 2: retrieve the Instagram posts for this listing that match the query.
        urls, text = pipeline.post_retrieval(sf_data[v], query)
        ids += 1
        if ids >= max_listing:
            break
    return urls, text
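
# Hypothetical driver for the single-address path. `Pipeline` stands in for
# whatever class in this project exposes query_retsly_v1 and post_retrieval,
# and the query string is illustrative; both are assumptions, not this
# module's actual entry point.
def run_single_address_demo(query='coffee shops'):
    pipeline = Pipeline()  # assumed constructor; adjust to the real class
    urls, text = single_address_full_pipeline(pipeline, query)
    for url, caption in zip(urls, text):
        print(url, caption)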
def sf_batch_full_pipeline(pipeline, query):
    # Step 1: query Retsly to get listing geolocations. query_retsly_v1 is
    # assumed to return the same 5-tuple used elsewhere in this module; the
    # image URLs and descriptions are unused here.
    coordinates, address, retsly_ids, _, _ = pipeline.query_retsly_v1()
    # Reference geolocations:
    #geolocation_tuple = [-122.4120249, 37.73691843]
    #current_loc_trulia = [-122.3981040, 37.788900]

    # Step 2: query Instagram for social media data, cached on disk. If the
    # cache is missing, fetch and reload it (query_instagram_media is assumed
    # to take the cache name, matching its use in more_sf_data below).
    try:
        sf_data = _load_obj('static_10_sf_city')
    except Exception:
        pipeline.query_instagram_media(coordinates, retsly_ids, 'static_10_sf_city', 10)
        sf_data = _load_obj('static_10_sf_city')

    # Step 3: build word-vector similarities and topics for each listing.
    for k, v in enumerate(sf_data):
        print(k, v)  # v is the Retsly id for one listing
        idx = retsly_ids.index(v)
        print(retsly_ids[idx], address[idx])
        urls, text = pipeline.post_retrieval(sf_data[v], query)
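
# Illustration of the word-vector matching that step 3 refers to: rank a
# listing's captions by cosine similarity between mean word vectors of the
# query and of each caption. `word_vectors` (a token -> numpy array mapping)
# and the whitespace tokenizer are hypothetical stand-ins; post_retrieval's
# real internals are not shown in this module.
import numpy as np

def rank_captions(query, captions, word_vectors):
    """Return (similarity, caption) pairs, best match first."""
    def embed(text):
        vecs = [word_vectors[w] for w in text.lower().split() if w in word_vectors]
        return np.mean(vecs, axis=0) if vecs else None

    q = embed(query)
    if q is None:
        return []
    scored = []
    for cap in captions:
        c = embed(cap)
        if c is None:
            continue
        sim = float(np.dot(q, c) / (np.linalg.norm(q) * np.linalg.norm(c)))
        scored.append((sim, cap))
    return sorted(scored, reverse=True)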
def more_sf_data(pipeline, save_name):
    # Fetch (or load from the on-disk cache) Instagram data for the Retsly
    # listings; query_retsly_v1 is assumed to return the 5-tuple used above.
    coordinates, address, retsly_ids, _, _ = pipeline.query_retsly_v1()
    try:
        sf_data = _load_obj(save_name)
    except Exception:
        pipeline.query_instagram_media(coordinates, retsly_ids, save_name, 10)
        sf_data = _load_obj(save_name)
    return sf_data
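
# The _load_obj helper used throughout is not defined in this file; a minimal
# sketch of the pickle-on-disk convention the try/except caching implies. The
# '.pkl' suffix and the companion _save_obj are assumptions.
import pickle

def _save_obj(obj, name):
    with open(name + '.pkl', 'wb') as f:
        pickle.dump(obj, f)

def _load_obj(name):
    with open(name + '.pkl', 'rb') as f:
        return pickle.load(f)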