def scrape(stadium, overwrite=False):
    """Fetch and store location, altitude and Wikipedia data for a stadium.

    The stadium name is geocoded with ``gmaps``; when a geocode hit exists,
    its first result and the first elevation result for that location are
    upserted into the ``stadiums`` collection. Independently of the geocode
    outcome, the first Wikipedia search hit (if any) has its infobox
    upserted into the ``wiki`` collection.

    :param stadium: venue name, used both as the search term and as the
        ``venueName`` key of the stored documents.
    :param overwrite: when true, re-scrape and replace the stored documents
        even if the venue is already present in ``stadiums``.
    :return: ``True`` if the venue was already stored and was skipped,
        otherwise ``None``.
    """
    print('Searching for {}'.format(stadium))
    # Only skip when the caller did not ask for a refresh; previously the
    # ``overwrite`` flag was ignored and existing venues were never updated.
    if stadiums.find_one({'venueName': stadium}) and not overwrite:
        print('Already in db')
        return True

    # Add string stadium to name of the stadium for wikipedia search,
    # the first search hit is taken as the matching page.
    wiki_name = wikipedia.search(html.unescape(stadium + ' stadium'))
    geocode_result = gmaps.geocode(stadium)

    if geocode_result:
        # Elevation is looked up for the coordinates of the first geocode hit.
        elevation_result = gmaps.elevation(
            convert.normalize_lat_lng(
                geocode_result[0]['geometry']['location']))
        stadium_data_to_insert = {
            'venueName': stadium,
            'location_data': geocode_result[0],
            'altitude_data': elevation_result[0]
        }
        stadiums.replace_one({'venueName': stadium},
                             stadium_data_to_insert,
                             upsert=True)

    if wiki_name:
        wiki_page = wptools.page(wiki_name[0], silent=True)
        wiki_page.get_parse()
        wiki_data = wiki_page.data['infobox']
        wiki_data_to_insert = {
            'venueName': stadium,
            'wiki_name': wiki_name[0],
            'wiki_data': wiki_data
        }
        wiki.replace_one({'venueName': stadium},
                         wiki_data_to_insert,
                         upsert=True)
def scrape_stadium_manually(OPTAvenueName, alternativeName, overwrite=False):
    """Scrape a venue using a hand-picked search name.

    ``alternativeName`` is what gets sent to the Wikipedia search and to the
    geocoder, while ``OPTAvenueName`` is the ``venueName`` under which the
    resulting documents are upserted into ``stadiums`` and ``wiki``.

    :param OPTAvenueName: venue name used as the storage key.
    :param alternativeName: name used for the external lookups.
    :param overwrite: when true, scrape again even if the venue is already
        present in ``stadiums``.
    :return: ``True`` if the venue was already stored and was skipped,
        otherwise ``None``.
    """
    print('Searching for {}'.format(OPTAvenueName))

    venue_filter = {'venueName': OPTAvenueName}
    existing = stadiums.find_one(venue_filter)
    if existing and not overwrite:
        print('Already in db')
        return True

    search_term = html.unescape(alternativeName + ' stadium')
    wiki_hits = wikipedia.search(search_term)
    places = gmaps.geocode(alternativeName)

    if places:
        # Only the first geocode hit is used, for both location and altitude.
        best_place = places[0]
        lat_lng = convert.normalize_lat_lng(
            best_place['geometry']['location'])
        elevations = gmaps.elevation(lat_lng)
        stadium_doc = {
            'venueName': OPTAvenueName,
            'location_data': best_place,
            'altitude_data': elevations[0],
        }
        stadiums.replace_one(venue_filter, stadium_doc, upsert=True)

    if wiki_hits:
        # The first search hit is treated as the venue's Wikipedia page.
        page_title = wiki_hits[0]
        page = wptools.page(page_title, silent=True)
        page.get_parse()
        wiki_doc = {
            'venueName': OPTAvenueName,
            'wiki_name': page_title,
            'wiki_data': page.data['infobox'],
        }
        wiki.replace_one(venue_filter, wiki_doc, upsert=True)
def on_data(self, data):
    """Index one streamed tweet into Elasticsearch with geocoded coordinates.

    While ``self.count`` is below ``self.limit``, the raw JSON payload is
    parsed; English tweets whose author has a non-empty profile location
    are geocoded and indexed under the tweet id, and ``self.count`` is
    incremented. Once the limit is reached the module-level ``stream`` is
    disconnected instead.

    Processing is best-effort: payloads that are not English tweets, that
    have no usable location, or whose location cannot be geocoded are
    skipped. Unexpected errors are logged rather than raised, so this
    method itself never raises.

    :param data: raw JSON string for one stream message.
    :return: ``None`` in every case.
    """
    import logging

    log = logging.getLogger(__name__)
    try:
        if self.count >= self.limit:
            stream.disconnect()
            return

        tweet = json.loads(data)
        # Payloads without a 'lang' field are skipped rather than failing
        # on a missing key.
        if tweet.get('lang') != 'en':
            return

        user = tweet.get('user') or {}
        place = user.get('location')
        if not place:
            return

        geocode_result = gmaps.geocode(place)
        if not geocode_result:
            # Free-text profile locations often do not resolve; skip them
            # explicitly instead of relying on a swallowed IndexError.
            log.debug('No geocode result for location %r', place)
            return

        location = geocode_result[0]['geometry']['location']
        lat = location['lat']
        lng = location['lng']
        tweet_id = str(tweet['id'])
        # Drop non-ASCII characters before categorising and indexing.
        tweet_text = tweet['text'].lower().encode(
            'ascii', 'ignore').decode('ascii')
        raw_tweet = {
            'user': user['screen_name'],
            'text': tweet_text,
            'place': place,
            'coordinates': {'location': str(lat) + "," + str(lng)},
            'time': tweet['created_at'],
            'category': get_category(tweet_text)
        }
        es.index(index=ES_INDEX, doc_type=ES_TYPE, id=tweet_id,
                 body=raw_tweet)
        self.count += 1
    except Exception:
        # Callback boundary: keep one bad message from escaping, but leave
        # a trace instead of discarding the failure silently.
        log.exception('Failed to process streamed tweet')
def get_geocode_from_address(address):
    """
    Look up the geocoding data for an address.

    :param address: the address of a place, formatted as commonly written in Belgium.
    :type address: string
    :return: the value returned by ``gmaps.geocode`` for the address,
        unmodified (NOTE(review): the previous docstring called this a
        dictionary -- confirm against the geocoding client).
    :raises InvalidLocation: if the lookup fails with ``TransportError``
        or ``Timeout``.
    """
    try:
        return gmaps.geocode(address)
    except (TransportError, Timeout):
        # Transport-level failures are reported to callers as an invalid
        # location.
        raise InvalidLocation('The entered location is invalid.')
tso.set_language('en') ts = TwitterSearch(consumer_key=TWITTER_CON_ACCESS, consumer_secret=TWITTER_CON_SECRET, access_token=TWITTER_ACCESS, access_token_secret=TWITTER_SECRET) count = 0 for tweet in ts.search_tweets_iterable(tso): try: if tweet['user'].get('location') is not None: place = tweet['user'].get('location') if place: tweet_id = str(tweet['id']) geocode_result = gmaps.geocode(place) lat = geocode_result[0]['geometry']['location']['lat'] lng = geocode_result[0]['geometry']['location']['lng'] tweet_text = tweet['text'].lower().encode( 'ascii', 'ignore').decode('ascii') raw_tweet = { 'user': tweet['user']['screen_name'], 'text': tweet_text, 'place': place, 'coordinates': { 'location': str(lat) + "," + str(lng) }, 'time': tweet['created_at'], 'category': get_category(tweet_text) } es.index(index=ES_INDEX,
tso.set_keywords(['sports']) tso.set_language('en') ts = TwitterSearch(consumer_key=TWITTER_CON_ACCESS, consumer_secret=TWITTER_CON_SECRET, access_token=TWITTER_ACCESS, access_token_secret=TWITTER_SECRET) count = 0 for tweet in ts.search_tweets_iterable(tso): try: if tweet['user'].get('location') is not None: location = tweet['user'].get('location') tweet_id = str(tweet['id']) geocode_result = gmaps.geocode(location) print geocode_result tweet_text = tweet['text'].lower().encode( 'ascii', 'ignore').decode('ascii') raw_tweet = { 'user': tweet['user']['screen_name'], 'text': tweet_text, 'location': tweet['user']['location'], 'coordinates': geocode_result, 'time': tweet['created_at'], 'category': get_category(tweet_text) } es.index(index=ES_INDEX, doc_type=ES_TYPE, id=tweet_id, body=raw_tweet)