class InstagramSearcher:
    """Creates an Instagram API client and returns search results"""

    def __init__(self):
        # Credentials come from the environment; a KeyError here means the
        # EDUC8_IG_* variables are not configured.
        self.client = InstagramAPI(client_id=os.environ['EDUC8_IG_CLIENT_ID'], client_secret=os.environ['EDUC8_IG_CLIENT_SECRET'])
        # Accumulates result dicts across search_all_locations() calls.
        self.search_results = []

    def get_ig_locations(self, lat, lng):
        # Return the raw python-instagram Location objects near (lat, lng).
        return self.client.location_search(lat=lat, lng=lng)

    def search_all_locations(self, locations):
        # Collect recent media for every given location into self.search_results.
        for location in locations:
            try:
                results = self.client.location_recent_media(location_id = location.id)
                # results[0] is the media list; results[1] (pagination) is unused.
                for media in results[0]:
                    result = {'source': 'Instagram'}
                    result['location_name'] = location.name
                    result['created'] = media.created_time
                    result['username'] = media.user.username
                    result['avatar'] = media.user.profile_picture
                    # Caption may be absent on a media item.
                    if media.caption:
                        result['caption'] = media.caption.text
                    else:
                        result['caption'] = ""
                    result['url'] = media.images['standard_resolution'].url
                    result['ig_shortcode'] = get_ig_shortcode(media.link) # TODO: this helper method is used by both IG and TwitterSearcher objects, how to make this work?
                    self.search_results.append(result)
            except InstagramAPIError as e:
                print e
            except Exception as ex:
                # NOTE(review): broad catch prints and moves on to the next
                # location — consider logging with a traceback instead.
                print ex

    def get_photo_url_from_shortcode(self, shortcode):
        # Resolve a media shortcode to its standard-resolution image URL.
        ig_result = self.client.media_shortcode(shortcode)
        return ig_result.images['standard_resolution'].url
class VenuePhotoCrawlerInstagram: def __init__(self): self.client = InstagramAPI( client_id=config.instagram_client_id, client_secret=config.instagram_client_secret) def fetch_instagram_id(self, foursquare_id): print 'foursquare id is ', foursquare_id res = self.client.location_search(foursquare_v2_id=foursquare_id) return res[0].id def show_popular(self): popular_media = self.client.media_popular(20) for media in popular_media: print media.caption.text def grab_photos(self, foursquare_id, max_pages, min_timestamp): try: instagram_id = self.fetch_instagram_id(foursquare_id) gen = self.client.location_recent_media( count=200, location_id=instagram_id, as_generator=True, max_pages=max_pages, min_timestamp=min_timestamp) page_cnt = 0 except: return for page in gen: save_photo_instagram(page[0], foursquare_id, instagram_id) print 'fetching page', page_cnt page_cnt += 1 time.sleep(config.instagram_API_pause)
def location_search(lat, lng):
    """Return ids of Instagram locations within 5000 m of (lat, lng).

    Any failure is printed and swallowed; whatever ids were gathered before
    the failure (usually none) are returned.
    """
    found_ids = []
    try:
        client = InstagramAPI(access_token=access_token, client_secret=client_secret)
        nearby = client.location_search(lat=lat, lng=lng, distance=5000)
        found_ids.extend(place.id for place in nearby)
    except Exception as err:
        print(err)
    return found_ids
def fetch_instagram(client_id, client_secret, lat=None, lng=None, foursquare_id=None):
    """Return (url, width, height) tuples of standard-resolution photos for all
    Instagram locations matching the given coordinates / Foursquare v2 id.

    Returns an empty list on any Instagram API error.
    """
    api = InstagramAPI(client_id=client_id, client_secret=client_secret)
    try:
        matches = api.location_search(lat=lat, lng=lng, foursquare_v2_id=foursquare_id)
        photos = []
        for place in matches:
            media_items, _ = api.location_recent_media(location_id=place.id)
            for item in media_items:
                std = item.images['standard_resolution']
                photos.append((std.url, std.width, std.height))
        return photos
    except InstagramAPIError:
        return []
class InstagramScraper(object):
    """Finds Instagram location records matching points of interest (POIs).

    Credentials and search limits are read from the `consts` collection in
    MongoDB (document name "instagram").
    """

    def __init__(self):
        self._connection = settings.mongodb_connection
        self._config = self._connection.data_mining_system.consts.find_one(
            {"name": "instagram"})
        client_id = str(self._config["access_keys"][0]["client_id"])
        client_secret = str(self._config["access_keys"][0]["client_secret"])
        self._api = InstagramAPI(client_id=client_id,
                                 client_secret=client_secret)
        # Fall back to module-level defaults when the config omits limits.
        self._max_results = self._config.get("max_results", MAX_RESULTS)
        self._max_distance = self._config.get("max_distance", MAX_DISTANCE)

    def find_location(self, q, lat, lng, source_name=None, source_id=None):
        """Search Instagram locations by name near (lat, lng).

        When source_name/source_id identify a Foursquare or Facebook place,
        the search is additionally pinned to that place.
        """
        arguments = dict(
            q=q,
            count=self._max_results,
            distance=self._max_distance,
            lat=lat,
            lng=lng,
        )
        # BUG FIX: python-instagram's location_search accepts the lowercase
        # keyword parameters foursquare_v2_id / facebook_places_id; the former
        # uppercase keys were never recognised, making the place filter a no-op.
        if source_name == "foursquare":
            arguments["foursquare_v2_id"] = source_id
        elif source_name == "facebook":
            arguments["facebook_places_id"] = source_id
        results = self._api.location_search(**arguments)
        return results

    def find_locations(self, poi_data):
        """Search locations for a POI document (uses its name + coordinates)."""
        source_ids = poi_data.get('source_ids', {})
        foursquare_ids = source_ids.get('foursquare', [])
        facebook_ids = source_ids.get('facebook', [])
        poi_name = text_utils.clean_text(poi_data["name"])
        lat, lng = poi_data["coordinates"]
        results = self.find_location(poi_name, lat, lng)
        return results

    # The POI pipeline hooks below are placeholders, not implemented yet.
    def search_poi(self, poi_data):
        return None

    def match_poi(self, poi_data, search_results):
        return None

    def get_poi(self, poi_data):
        return None

    def find_poi(self, poi_data):
        return None
class InstagramScraper(object):
    """Scrapes Instagram location data for points of interest.

    API credentials and limits come from the MongoDB `consts` document named
    "instagram".
    """

    def __init__(self):
        self._connection = settings.mongodb_connection
        self._config = self._connection.data_mining_system.consts.find_one({"name": "instagram"})
        client_id = str(self._config["access_keys"][0]["client_id"])
        client_secret = str(self._config["access_keys"][0]["client_secret"])
        self._api = InstagramAPI(client_id=client_id, client_secret=client_secret)
        # Config may omit these; fall back to module-level defaults.
        self._max_results = self._config.get("max_results", MAX_RESULTS)
        self._max_distance = self._config.get("max_distance", MAX_DISTANCE)

    def find_location(self, q, lat, lng, source_name=None, source_id=None):
        """Search Instagram locations by name near (lat, lng), optionally
        pinned to a Foursquare or Facebook place id."""
        arguments = dict(
            q=q,
            count=self._max_results,
            distance=self._max_distance,
            lat=lat,
            lng=lng,
        )
        # BUG FIX: location_search expects lowercase parameter names
        # (foursquare_v2_id / facebook_places_id); the previous uppercase
        # keys were not accepted parameters, so the filter never applied.
        if source_name == "foursquare":
            arguments["foursquare_v2_id"] = source_id
        elif source_name == "facebook":
            arguments["facebook_places_id"] = source_id
        results = self._api.location_search(**arguments)
        return results

    def find_locations(self, poi_data):
        """Search locations for a POI document (name + coordinates)."""
        source_ids = poi_data.get('source_ids', {})
        foursquare_ids = source_ids.get('foursquare', [])
        facebook_ids = source_ids.get('facebook', [])
        poi_name = text_utils.clean_text(poi_data["name"])
        lat, lng = poi_data["coordinates"]
        results = self.find_location(poi_name, lat, lng)
        return results

    # Placeholder pipeline hooks — intentionally return None.
    def search_poi(self, poi_data):
        return None

    def match_poi(self, poi_data, search_results):
        return None

    def get_poi(self, poi_data):
        return None

    def find_poi(self, poi_data):
        return None
class VenuePhotoCrawlerInstagram: def __init__(self): self.client = InstagramAPI(client_id = config.instagram_client_id, client_secret = config.instagram_client_secret) def fetch_instagram_id(self, foursquare_id): print 'foursquare id is ',foursquare_id res = self.client.location_search(foursquare_v2_id=foursquare_id) return res[0].id def show_popular(self): popular_media = self.client.media_popular(20) for media in popular_media: print media.caption.text def grab_photos(self, foursquare_id, max_pages): try: instagram_id = self.fetch_instagram_id(foursquare_id) gen = self.client.location_recent_media(count=200, location_id = instagram_id, as_generator=True, max_pages=max_pages)#, return_json=True) page_cnt = 0 except: return for page in gen: save_photo_instagram(page[0], foursquare_id, instagram_id) print 'fetching page',page_cnt page_cnt+=1 time.sleep(config.instagram_API_pause)
def images_geo_json():
    # Build an HTML page embedding recent media found around a fixed coordinate.
    api = InstagramAPI(access_token=access_token, client_secret=client_secret)
    # NOTE(review): q=5000 looks wrong — `q` is a text search query; 5000 was
    # presumably meant to be distance=5000. Confirm against the API docs.
    json_locations = api.location_search(q=5000, count=None, lat=55.770968, lng=38.680028, foursquare_id=None, foursquare_v2_id=None)
    html_code = '''<html><body>'''
    for location in json_locations:
        # NOTE(review): positional args — presumably (count, max_id,
        # location_id); verify against location_recent_media's parameter order.
        media = api.location_recent_media(5, None, location.id)
        for element in media:
            if isinstance(element, list):
                # A pagination page: images_geo returns more HTML to append.
                html_code += images_geo(element, html_code, location.id)
            elif isinstance(element, Media):
                # De-duplicate by media id; image_ids is module-level state.
                # NOTE(review): original indentation ambiguous — assumed the
                # <img> append happens only for unseen media ids.
                if image_ids.count(element.id) == 0:
                    image_ids.append(element.id)
                    html_code += '''<img src=''' + element.images['standard_resolution'].url \
                        + ''' alt=''' + location.id + '''>'''
    html_code += '''</body></html>'''
    print image_ids
    print html_code
    return html_code
from instagram.client import InstagramAPI api = InstagramAPI(client_id='ade077a508f241b599aa55d924730a10', client_secret='85a2c94c85d844b79d39e86e7d8d84a7') from itertools import product import numpy as np coordinates = list( product(np.arange(50.40, 50.47, 0.005), np.arange(30.47, 30.60, 0.001))) print coordinates users = set() for coord in coordinates: try: locations_list = api.location_search(lat=coord[0], lng=coord[1]) for location in locations_list: medias = api.location_recent_media(location_id=location.id)[0] for media in medias: print media.user except: pass
def locations(lt, lg):
    """Return Instagram locations near latitude `lt` / longitude `lg`."""
    client = InstagramAPI(client_id='53ff568efcd6492eb9b88c7b92a615b4',
                          client_secret='3649d3a0675647b1839a5aa580a10dbc')
    return client.location_search(lat=lt, lng=lg)
class InstagramCrawler:
    # Downloads thumbnails around a fixed coordinate and records them in a CSV.

    def __init__(self):
        self.api = InstagramAPI(
            access_token=ACCESS_TOKEN,
            client_id = CLIENT_ID,
            client_secret = CLIENT_SECRET)
        self.clean_dir_csv()

    def clean_dir_csv(self):
        # Remove previous outputs and (re)create both image directories.
        if os.path.exists(OUTPUT_CSV):
            os.remove(OUTPUT_CSV)
        if os.path.exists(OUTPUT_IMAGE_DIR):
            files = glob.glob(OUTPUT_IMAGE_DIR + '/*.jpg')
            for path in files:
                os.remove(path)
        else:
            os.mkdir(OUTPUT_IMAGE_DIR)
        if os.path.exists(OUTPUT_IMAGE320_DIR):
            files = glob.glob(OUTPUT_IMAGE320_DIR + '/*.jpg')
            for path in files:
                os.remove(path)
        else:
            os.mkdir(OUTPUT_IMAGE320_DIR)

    def search_instagram(self):
        # Iterate nearby locations and page through their recent media.
        # NOTE(review): open(OUTPUT_CSV, 'w') inside the pagination loop
        # truncates the CSV on every page, so only the last page's rows
        # survive — consider opening once outside the loops or using 'a'.
        location_ids, location_lls = self.__search_location_ids()
        count = 0
        for location_id,location_ll in zip(location_ids,location_lls):
            max_id = ""
            next = True  # NOTE(review): shadows the builtin `next`
            while(not next is None):  # NOTE(review): prefer `next is not None`
                media_ids,next = self.api.location_recent_media(
                    count = 30,
                    location_id = location_id,
                    max_id = max_id)
                with open(OUTPUT_CSV, 'w') as csvfile:
                    #writer = csv.writer(csvfile, delimiter=str(','), quoting=csv.QUOTE_MINIMAL)
                    writer = csv.writer(csvfile, delimiter=str(','), quoting=csv.QUOTE_MINIMAL)
                    writer.writerow(['origin_url', 'file_name', 'latitude', 'longitude', 'location_id', 'tags'])
                    for media_id in media_ids:
                        # media_id is actually a Media object, not an id.
                        thumb_url = media_id.images['thumbnail'].url
                        thumb320_url = media_id.images['low_resolution'].url
                        print "now %d downloading" % count
                        try:
                            r = requests.get(thumb_url)
                            r2 = requests.get(thumb320_url)
                            if r.status_code == 200 and r2.status_code == 200:
                                # Save both resolutions under the same name.
                                file_name = "%04d.jpg" % count
                                path = "{0}/{1}".format(OUTPUT_IMAGE_DIR, file_name)
                                self.__save_image(path, r.content)
                                path = "{0}/{1}".format(OUTPUT_IMAGE320_DIR, file_name)
                                self.__save_image(path, r2.content)
                                tags = [tag.name for tag in media_id.tags]
                                print tags
                                tags = unicode(','.join(tags)).encode('utf_8')
                                print writer.writerow([thumb_url, file_name, location_ll[0], location_ll[1], location_id, tags])
                                print file_name
                        except Exception as e:
                            print type(str(e))
                            print e.message
                            pass
                        # Extract the max_id cursor from the next-page URL.
                        if not next is None:
                            temp, max_location_id = next.split("max_id=")
                            max_id = str(max_location_id)
                        count += 1
                    # NOTE(review): original indentation is ambiguous here —
                    # as placed, this ends pagination after one page per
                    # location despite the while loop above. Confirm intent.
                    next = None

    def __search_location_ids(self):
        # Return parallel lists: location ids and their [lat, lng] points.
        media_ids = self.api.location_search(
            count = LOCATION_COUNT,
            lat = LAT,
            lng = LNG,
            distance = DISTANCE)
        return [
            [media_id.id for media_id in media_ids],
            [[media_id.point.latitude, media_id.point.longitude] for media_id in media_ids]]

    def __save_image(self, save_path, img_contents):
        # Write raw image bytes to disk.
        f = open(save_path, "wb")
        f.write(img_contents)
        f.close()
# NOTE(review): this chunk begins mid-function — the enclosing `def` (a tag
# counter over `listMedia`, judging by the body) is outside this view.
tags = []
for media in listMedia:
    for mediaTag in media.tags:
        tags.append(mediaTag.name)
return Counter(tags)

def getMedia(locationId):
    # Return the media list (first element of the (media, pagination) tuple).
    medias = api.location_recent_media(location_id=locationId)
    return medias[0]

bestLocations = [];
latD=48.858844
lonD=2.294351
# Scan a 20x20 grid of coordinates around (latD, lonD) in 0.001-degree steps.
for x in range(-10, 10):
    for z in range(-10,10):
        print(x,z)
        locations = api.location_search(lat=48.858844+x*0.001, lng=2.294351+z*0.001)
        for location in locations:
            likes = 0
            # Record each location name only once across the whole scan.
            if not any(d['name'] == location.name for d in bestLocations):
                images = getMedia(location.id)
                likes = getNbLikes(images)
                tags = getTags(images)
                if len(images)>0 :
                    bestLocations.append(dict(name=location.name,latitude=location.point.latitude,longitude=location.point.longitude,likes=likes,tags=tags,id=location.id,nbrImages=len(images)))
finalData = pd.DataFrame.from_dict(bestLocations)
finalData.to_csv('instadata.csv', sep='\t', encoding='utf-8')
# NOTE(review): chunk starts mid-script — `data`, `name`, `lon` and `places`
# are populated by earlier code outside this view.
lat=data['latitude']
stars=data['stars']
review_counts=data['review_count']
places.append([name,lon,lat,stars,review_counts])

distance=2
count=1
combine=[]
# For each Yelp place, look up a matching Instagram location and record how
# many recent media items it has ("check-ins").
for i in range(len(places)):
    lon=places[i][1]
    lat=places[i][2]
    stars=places[i][3]
    review_counts=places[i][4]
    yelp_name=places[i][0]
    # BUG FIX: the previous positional call location_search(distance, count,
    # lat, lon) bound `distance` to the text-query parameter `q`, so the
    # intended search distance was never applied. Pass everything by keyword.
    location=api.location_search(lat=lat, lng=lon, distance=distance, count=count)
    for place in location:
        if check_name(yelp_name,place.name):
            recent_media, next= api.location_recent_media(location_id=place.id)
            if len(recent_media)!=0:
                combine.append([yelp_name,stars,review_counts,len(recent_media)])

# Write the combined Yelp/Instagram stats; `with` guarantees the file closes.
with open('finalProjectPA_WI.csv','w') as outfile:
    outfile.write('Name,Yelp_stars,Yelp_review,Instagram_checkin\n')
    for item in combine:
        line=item[0]+','+str(item[1])+','+str(item[2])+','+str(item[3])
        outfile.write(unicode(line).encode('utf-8')+'\n')
class InstaHandler(object):
    """Collects Instagram locations and photos around a point into in-memory lists."""

    def __init__(self):
        # Connect to Instagram API
        self.orig_client = True
        # Set parameters in params.py
        self.api = InstagramAPI(access_token=params.access_token, client_secret=params.client_secret)
        self.itercount = 0      # number of pagination requests made so far
        self.PHOTO_LIST = []    # accumulated photo dicts
        self.LOCATION_LIST = [] # accumulated location dicts

    def find_locations(self, lat, lng, radius):
        # Populate LOCATION_LIST with locations within `radius` of (lat, lng).
        try:
            locations = self.api.location_search(lat=lat, lng=lng, distance=radius, count=33)
            #print(len(locations))
            # If response hits the limit
            # Limit = 33
            if len(locations) == 33:
                print 'Response hits limit of 33. Not all locations were returned.'
            for loc in locations:
                self.LOCATION_LIST.append({
                    "name": loc.name,
                    "id": loc.id,
                    "lat": loc.point.latitude,
                    "lon": loc.point.longitude
                })
        except Exception as e:
            print e
            # NOTE(review): assumes the exception carries a status_code
            # attribute — only true for API errors; confirm.
            if e.status_code == '429':
                print 'Limit exceeded.'

    def photo_in_location(self, location_id, max_id):
        # Recursively page through a location's recent media, appending image
        # records to PHOTO_LIST until DATE_LIMIT is passed or >5 pages fetched.
        try:
            if max_id == 0:
                # First call: no pagination cursor yet.
                medias = self.api.location_recent_media(location_id=location_id, count=33)
            else:
                medias = self.api.location_recent_media(location_id=location_id, max_id=max_id, count=33)
            self.itercount += 1
        except Exception, e:
            print e
            if e.status_code == '429':
                print 'Limit exceeded.'
        # Check if there are more photos to download
        # NOTE(review): if the request above raised, `medias` is unbound here
        # and this line raises NameError; the handler does not return.
        if len(medias[0]) > 0:
            cont = True
            print medias[0][0].location.name
            for media in medias[0]:
                # Discard videos now
                if media.type == 'image':
                    tags_arr = []
                    for tag in media.tags:
                        tags_arr.append(tag.name)
                    caption = media.caption
                    if caption == None:
                        text = None
                    else:
                        text = caption.text
                    try:
                        self.PHOTO_LIST.append({
                            "id": media.id.split('_')[0],
                            "username": unicode(media.user.username).encode('utf8'),
                            "user_id": media.user.id,
                            "likes": media.like_count,
                            "comments": media.comment_count,
                            "tagged_users": len(media.users_in_photo),
                            "filter": media.filter,
                            "caption": unicode(text).encode('utf8'),
                            "url": media.link,
                            "photo_url": media.images['standard_resolution'].url,
                            "location_id": media.location.id,
                            "created_at": str(media.created_time),
                            "tags": tags_arr
                        })
                    except AttributeError:
                        # Some media lack a location attribute; skip them.
                        continue
                # Stop once media are older than the cutoff or 5 pages done.
                if media.created_time < DATE_LIMIT or self.itercount > 5:
                    cont = False
            # Recursively continue downloading, pass last media id
            if cont:
                self.photo_in_location(location_id, media.id.split('_')[0])
from instagram.client import InstagramAPI api = InstagramAPI(client_id='ade077a508f241b599aa55d924730a10', client_secret='85a2c94c85d844b79d39e86e7d8d84a7') from itertools import product import numpy as np coordinates = list(product(np.arange(50.40,50.47,0.005), np.arange(30.47,30.60,0.001))) print coordinates users = set() for coord in coordinates: try: locations_list = api.location_search(lat=coord[0],lng=coord[1]) for location in locations_list: medias = api.location_recent_media(location_id=location.id)[0] for media in medias: print media.user except: pass
# NOTE(review): chunk starts mid-function — this `return` closes a tag-counting
# helper whose `def` line is outside this view.
return Counter(tags)

def getMedia(locationId):
    # First element of the (media_list, pagination) tuple.
    medias = api.location_recent_media(location_id=locationId)
    return medias[0]

bestLocations = []
latD = 48.858844
lonD = 2.294351
# Scan a 20x20 grid of coordinates around (latD, lonD) in 0.001-degree steps.
for x in range(-10, 10):
    for z in range(-10, 10):
        print(x, z)
        locations = api.location_search(lat=48.858844 + x * 0.001, lng=2.294351 + z * 0.001)
        for location in locations:
            likes = 0
            # Only record a location name once across the whole scan.
            if not any(d['name'] == location.name for d in bestLocations):
                images = getMedia(location.id)
                likes = getNbLikes(images)
                tags = getTags(images)
                if len(images) > 0:
                    bestLocations.append(
                        dict(name=location.name,
                             latitude=location.point.latitude,
                             longitude=location.point.longitude,
                             likes=likes,
                             tags=tags,
                             id=location.id,
                             nbrImages=len(images)))
# Map Foursquare venue ids to Instagram location ids for one city shard,
# appending results to per-city output files.
client_id_ins = keys_insta_loc.ix[ind_key, 'id']
client_secret_ins = keys_insta_loc.ix[ind_key, 'secret']
access_token_ins = "284875445.cac25a6.626e3962134f453bb27180c390c8b6ac"
api = InstagramAPI(client_id = client_id_ins, client_secret = client_secret_ins)
venueid_4sq_num = pd.read_csv('dataset/'+ city + '/splitdata/' + city + 'venueid_4sq_num'+filename, header = None)
venueid_4sq_num.columns = ['venueid_4sq', 'venueid_number'] # assignt table name
for i in range(len(venueid_4sq_num)):
    try:
        ivenueid_4sq = venueid_4sq_num.ix[i, 'venueid_4sq']
        ivenueid_number = venueid_4sq_num.ix[i, 'venueid_number']
        insta_4sq = api.location_search(foursquare_v2_id = ivenueid_4sq)
        if len(insta_4sq) > 0:
            ivenueid_insta = insta_4sq[0].id
            # Append the (number, foursquare id, instagram id) mapping.
            fid = open('dataset/'+ city + '/splitdata/' + city + 'venueid_4sq_insta'+filename, 'a')
            fid.write(str(ivenueid_number) + ',' + ivenueid_4sq + ',' + ivenueid_insta)
            fid.write('\n')
            fid.close()
            # Append the venue's coordinates to the companion file.
            fid = open('dataset/'+ city + '/splitdata/' + city + 'big'+filename, 'a')
            fid.write(str(insta_4sq[0].point.latitude) + ',' + str(insta_4sq[0].point.longitude))
            fid.write('\n')
            fid.close()
    except BaseException, e:
        print str(e)
        print filename, i
        # NOTE(review): the chunk is truncated here — the body of this
        # rate-limit handler is missing from this view.
        if str(e) == '(429) Rate limited-Your client is making too many request per second':
class insta_stuff(object): def __init__(self): # Connect to Instagram API self.api = InstagramAPI(access_token=params.access_token, client_secret=params.client_secret) # Init OGR driver self.drv = ogr.GetDriverByName('KML') # Connect to Postgres self.conn = psycopg2.connect(host=params.pg_host, port=params.pg_port, user=params.pg_user, password=params.pg_pass, dbname=params.pg_db) self.cursor = self.conn.cursor() self.conn2 = psycopg2.connect(host=params.pg_host, port=params.pg_port, user=params.pg_user, password=params.pg_pass, dbname=params.pg_db) self.cursor2 = self.conn2.cursor() self.conn3 = psycopg2.connect(host=params.pg_host, port=params.pg_port, user=params.pg_user, password=params.pg_pass, dbname=params.pg_db) self.upload_cursor = self.conn3.cursor() def test(self): user_id = 5006616 recent_media, next_ = self.api.user_recent_media(user_id=user_id, count=10) for media in recent_media: print media.caption.text def upload_student_areas(self, path, kml): sql = 'INSERT INTO student_areas (id, poly_id, geom, bbox) VALUES (%s, %s, ST_SetSRID(ST_GeomFromText(%s),4326), ST_Envelope(ST_SetSRID(ST_GeomFromText(%s),4326)))' student_id = kml.split('.')[0] file = self.drv.Open(path + kml) for layer in file: for feature in layer: feature.geometry().FlattenTo2D() geom = feature.geometry().ExportToWkt() self.cursor.execute(sql, (str(student_id), str(i), geom, geom)) self.conn.commit() # Generate a grid in PostgreSQL and calls Location/search API method for all grid points # If the response hits the limit, refines grid locally to get all Locations ### DEBUG duplicates due to overlaps in query circles!!!! 
def get_locations(self, radius): # Get a grid within a polygon to draw circles # GRID SPACING: x # radius: (sqrt(2) * x) / 2 -> no holes # Start with reasonably big radius, only decrease it when the response hits the limit spacing = int(radius * 2 / sqrt(2)) # Generate grid to query Locations sql = 'SELECT makegrid(geom, %s) FROM student_areas' self.cursor.execute(sql, (spacing, )) for rec in self.cursor: tmp = eval(rec[0]) lng, lat =tmp[0], tmp[1] # Query Locations on current grid point locations = self.api.location_search(lat=lat, lng=lng, distance=radius, count=100) print(len(locations)) # If response hits the limit # Draw a circle with current radius on the current grid point and do a finer search if len(locations) == 33: print "FINER GRIDDD" self.get_finer(lat, lng, radius) else: self.upload_locations(locations) def get_finer(self, lat, lng, radius): sql = 'SELECT makegrid(ST_Buffer(ST_Transform(ST_SetSRID(ST_MakePoint(%s,%s),4326),3786),%s),%s)' self.cursor2.execute(sql, (lng, lat, radius, radius/2)) for rec in self.cursor2: tmp = eval(rec[0]) lng, lat = tmp[0], tmp[1] locations = self.api.location_search(lat=lat, lng=lng, distance=radius, count=100) # Recursively refine grid if neccessary if len(locations) == 33: self.get_finer(lat, lng, radius/2) else: self.upload_locations(locations) def upload_locations(self, locations): sql = 'INSERT INTO instagram_locations (id, name, lat, lng, geom) VALUES (%s, %s, %s, %s, ST_SetSRID(ST_MakePoint(%s, %s),4326))' for loc in locations: self.upload_cursor.execute(sql, (loc.id, loc.name, loc.point.latitude, loc.point.longitude, loc.point.longitude, loc.point.latitude)) self.conn3.commit()
#Print the media to the html code for media in totalFollowers: #first element of tuple contains media addImageHTML(media) createHTMLTemplate() # Embed links of all photos at lat/long location in html searchResults = api.media_search(count=MAXRESULTS, lat=latitude,lng=longitude, distance=DISTANCE) for media in searchResults: addImageHTML(media) # Get photos from location based on foursquareID if FOURSQUAREID: outputFile.write("<br><h1>Second Search (based on foursquare ID):</h1><br>\n") searchResults = api.location_search(count=MAXRESULTS, foursquare_v2_id=FOURSQUAREID, distance=DISTANCE) findMediaAtLocation(searchResults) if not FOURSQUAREID: print "No FOURSQUAREID given - skipping search by Foursquare location" sleep(2) # Embed photos of lat/long and nearby locations # Not as useful as first search if FOURSQUAREID: outputFile.write("<br><h1>Third Search (based on points of interest):</h1><br>\n") if not FOURSQUAREID: outputFile.write("<br><h1>Second Search (based on points of interest):</h1><br>\n") searchResults = api.location_search(count=MAXRESULTS, lat=latitude,lng=longitude, distance=DISTANCE) print "Found " + str(len(searchResults)) + " nearby landmarks to check for pictures near." findMediaAtLocation(searchResults)