Ejemplo n.º 1
0
def scrape(stadium, overwrite=False):
    """Look up a stadium by name and store its location and Wikipedia data.

    The stadium is geocoded through ``gmaps``; when a geocoding result is
    found, the elevation of its first match is fetched and both are
    upserted into the ``stadiums`` collection. Independently, Wikipedia is
    searched for ``"<stadium> stadium"`` and the infobox of the first hit is
    upserted into the ``wiki`` collection.

    :param stadium: Venue name. Used as the search term and as the
        ``venueName`` key of the stored documents.
    :param overwrite: When true, scrape again and replace the stored
        documents even if the stadium is already in ``stadiums``.

    :return: ``True`` when the stadium was already stored and the scrape was
        skipped; otherwise ``None``.
    """
    print('Searching for {}'.format(stadium))
    # Only short-circuit on an existing record when the caller did not ask
    # for a refresh; previously ``overwrite`` was accepted but ignored.
    if stadiums.find_one({'venueName': stadium}) and not overwrite:
        print('Already in db')
        return True

    # Add string stadium to name of the stadium for wikipedia search, take most popular result as correct
    wiki_name = wikipedia.search(html.unescape(stadium + ' stadium'))
    geocode_result = gmaps.geocode(stadium)
    if geocode_result:
        # Elevation is looked up at the coordinates of the first geocoding match.
        elevation_result = gmaps.elevation(
            convert.normalize_lat_lng(
                geocode_result[0]['geometry']['location']))
        stadium_data_to_insert = {
            'venueName': stadium,
            'location_data': geocode_result[0],
            'altitude_data': elevation_result[0]
        }
        # Upsert so a re-scrape replaces the old document instead of duplicating it.
        stadiums.replace_one({'venueName': stadium},
                             stadium_data_to_insert,
                             upsert=True)
    if wiki_name:
        wiki_page = wptools.page(wiki_name[0], silent=True)
        wiki_page.get_parse()
        wiki_data = wiki_page.data['infobox']
        wiki_data_to_insert = {
            'venueName': stadium,
            'wiki_name': wiki_name[0],
            'wiki_data': wiki_data
        }
        wiki.replace_one({'venueName': stadium},
                         wiki_data_to_insert,
                         upsert=True)
Ejemplo n.º 2
0
def scrape_stadium_manually(OPTAvenueName, alternativeName, overwrite=False):
    """Scrape a stadium using an alternative search name and store the results.

    ``alternativeName`` is what gets sent to the geocoder and (with
    ``' stadium'`` appended) to the Wikipedia search, while every stored
    document is keyed by ``OPTAvenueName``.

    :param OPTAvenueName: Venue name used as the ``venueName`` key in the
        ``stadiums`` and ``wiki`` collections.
    :param alternativeName: Name used for the geocoding and Wikipedia lookups.
    :param overwrite: When true, scrape again even if the venue is already
        present in ``stadiums``.

    :return: ``True`` when the venue was already stored and ``overwrite`` is
        false; otherwise ``None``.
    """
    print('Searching for {}'.format(OPTAvenueName))
    existing = stadiums.find_one({'venueName': OPTAvenueName})
    if existing and not overwrite:
        print('Already in db')
        return True

    # Both lookups run up front, Wikipedia first, before anything is written.
    wiki_hits = wikipedia.search(html.unescape(alternativeName + ' stadium'))
    places = gmaps.geocode(alternativeName)

    if places:
        best_match = places[0]
        coordinates = convert.normalize_lat_lng(
            best_match['geometry']['location'])
        elevations = gmaps.elevation(coordinates)
        stadium_doc = {
            'venueName': OPTAvenueName,
            'location_data': best_match,
            'altitude_data': elevations[0]
        }
        stadiums.replace_one(
            {'venueName': OPTAvenueName}, stadium_doc, upsert=True)

    if wiki_hits:
        top_hit = wiki_hits[0]
        page = wptools.page(top_hit, silent=True)
        page.get_parse()
        infobox = page.data['infobox']
        wiki_doc = {
            'venueName': OPTAvenueName,
            'wiki_name': top_hit,
            'wiki_data': infobox
        }
        wiki.replace_one(
            {'venueName': OPTAvenueName}, wiki_doc, upsert=True)
Ejemplo n.º 3
0
 def on_data(self, data):
     """Process one raw stream message, indexing geocoded English tweets.

     While ``self.count`` is below ``self.limit`` the message is parsed as
     JSON. English tweets whose user has a non-empty location are geocoded
     through ``gmaps`` and indexed into Elasticsearch under the tweet id.
     Once the limit is reached, ``stream.disconnect()`` is called instead.

     ``self.count`` is incremented for every message handled without error,
     whether or not it was indexed. Messages that fail (bad JSON, missing
     keys, service errors) or whose location cannot be geocoded are skipped
     without being counted.

     :param data: Raw message payload, a JSON string.
     :return: Always ``None``.
     """
     # Local import keeps this method self-contained.
     import logging

     try:
         if self.count >= self.limit:
             stream.disconnect()
             return

         tweet = json.loads(data)
         if tweet['lang'] == 'en':
             # ``get`` may yield None or an empty string; both mean "no location".
             place = tweet['user'].get('location')
             if place:
                 tweet_id = str(tweet['id'])
                 geocode_result = gmaps.geocode(place)
                 if not geocode_result:
                     # Explicit guard instead of relying on an IndexError;
                     # the message stays uncounted, as it always was.
                     logging.getLogger(__name__).debug(
                         'No geocoding result for %r; tweet skipped', place)
                     return
                 location = geocode_result[0]['geometry']['location']
                 lat = location['lat']
                 lng = location['lng']
                 # Lower-case and drop every non-ASCII character from the text.
                 tweet_text = tweet['text'].lower().encode('ascii', 'ignore').decode('ascii')
                 raw_tweet = {
                     'user': tweet['user']['screen_name'],
                     'text': tweet_text,
                     'place': place,
                     'coordinates': {'location': str(lat)+","+str(lng)},
                     'time': tweet['created_at'],
                     'category': get_category(tweet_text)
                 }
                 es.index(index=ES_INDEX, doc_type=ES_TYPE, id=tweet_id, body=raw_tweet)
         self.count += 1
     except Exception:
         # Still best-effort (one bad message must not kill the stream), but
         # the failure is now recorded instead of being silently discarded.
         logging.getLogger(__name__).exception(
             'Failed to process stream message')
Ejemplo n.º 4
0
    def get_geocode_from_address(address):
        """
        Geocode an address and return the geocoder's answer unchanged.

        :param address: the address to look up, formatted as commonly written in Belgium.
        :type address: string

        :return geocode_result: whatever ``gmaps.geocode`` returns for the address.
        :rtype geocode_result: presumably a list of result dictionaries --
            TODO confirm against the geocoding client in use.

        :raises InvalidLocation: when the geocoding call fails with
            ``TransportError`` or ``Timeout``.
        """
        try:
            return gmaps.geocode(address)
        except (TransportError, Timeout):
            # Transport-level failures are reported to callers as an invalid location.
            raise InvalidLocation('The entered location is invalid.')
Ejemplo n.º 5
0
    tso.set_language('en')

    ts = TwitterSearch(consumer_key=TWITTER_CON_ACCESS,
                       consumer_secret=TWITTER_CON_SECRET,
                       access_token=TWITTER_ACCESS,
                       access_token_secret=TWITTER_SECRET)

    count = 0

    for tweet in ts.search_tweets_iterable(tso):
        try:
            if tweet['user'].get('location') is not None:
                place = tweet['user'].get('location')
                if place:
                    tweet_id = str(tweet['id'])
                    geocode_result = gmaps.geocode(place)
                    lat = geocode_result[0]['geometry']['location']['lat']
                    lng = geocode_result[0]['geometry']['location']['lng']
                    tweet_text = tweet['text'].lower().encode(
                        'ascii', 'ignore').decode('ascii')
                    raw_tweet = {
                        'user': tweet['user']['screen_name'],
                        'text': tweet_text,
                        'place': place,
                        'coordinates': {
                            'location': str(lat) + "," + str(lng)
                        },
                        'time': tweet['created_at'],
                        'category': get_category(tweet_text)
                    }
                    es.index(index=ES_INDEX,
Ejemplo n.º 6
0
    tso.set_keywords(['sports'])
    tso.set_language('en')

    ts = TwitterSearch(consumer_key=TWITTER_CON_ACCESS,
                       consumer_secret=TWITTER_CON_SECRET,
                       access_token=TWITTER_ACCESS,
                       access_token_secret=TWITTER_SECRET)

    count = 0

    for tweet in ts.search_tweets_iterable(tso):
        try:
            if tweet['user'].get('location') is not None:
                location = tweet['user'].get('location')
                tweet_id = str(tweet['id'])
                geocode_result = gmaps.geocode(location)
                print geocode_result
                tweet_text = tweet['text'].lower().encode(
                    'ascii', 'ignore').decode('ascii')
                raw_tweet = {
                    'user': tweet['user']['screen_name'],
                    'text': tweet_text,
                    'location': tweet['user']['location'],
                    'coordinates': geocode_result,
                    'time': tweet['created_at'],
                    'category': get_category(tweet_text)
                }
                es.index(index=ES_INDEX,
                         doc_type=ES_TYPE,
                         id=tweet_id,
                         body=raw_tweet)