def get_location(title, main_text, url):
    locations_list = []
    # Split the URL on '/' and '-' so place names in the slug become plain words
    split_url = " ".join(re.split(r'[/-]', url))
    places = geograpy.get_place_context(text=main_text)
    places2 = geograpy.get_place_context(text=title)
    places3 = geograpy.get_place_context(text=split_url)
    cities = places.cities
    countries = places.countries
    cities2 = places2.cities
    countries2 = places2.countries
    cities3 = places3.cities
    countries3 = places3.countries
    if cities:
        locations_list = get_location_objects_from_cities(cities, locations_list)
    if cities2:
        locations_list = get_location_objects_from_cities(cities2, locations_list)
    if cities3:
        locations_list = get_location_objects_from_cities(cities3, locations_list)
    if countries:
        locations_list = get_location_objects_from_countries(countries, locations_list)
    if countries2:
        locations_list = get_location_objects_from_countries(countries2, locations_list)
    if countries3:
        locations_list = get_location_objects_from_countries(countries3, locations_list)
    if not locations_list:
        # Fall back to a default location - US, unknown city
        location = {'country': 'United States', 'location': 'unknown'}
        locations_list.append(location)
    return locations_list
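# A minimal usage sketch for get_location above. The title, body text and URL
# are made-up sample inputs; re and geograpy must be imported, and
# get_location_objects_from_cities / _countries are assumed to be defined
# elsewhere in the same module.
import re
import geograpy

locations = get_location(
    title="Floods hit Jakarta",
    main_text="Heavy rain flooded parts of Jakarta, Indonesia on Monday.",
    url="https://example.com/news/jakarta-floods-2020",
)
print(locations)  # e.g. [{'country': 'Indonesia', 'location': 'Jakarta'}, ...]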
def process_item(self, item, spider):
    '''
    DESCRIPTION:
    -----------
    For each news item, the list of countries mentioned in the news text
    is fetched using 'geograpy'.

    RETURNS:
    --------
    The news item with its 'countriesMentioned' field updated.
    '''
    try:
        places = geograpy.get_place_context(url=item['newsUrl'])
        countryList = []
        for country in places.country_mentions:
            countryList.append(country[0].encode('ascii', 'ignore'))
        item['countriesMentioned'] = countryList
    except etree.XMLSyntaxError as e:
        logging.info('XML Syntax Error: %s', e)
    except etree.DocumentInvalid as e:
        logging.info('XML Document Invalid Error: %s', e)
    except Exception:
        raise DropItem("Failed to extract country mentions from: " + item['newsUrl'])
    return item
def parseCountries(self, title):
    countries = geograpy.get_place_context(text=title).countries
    newCountries = []
    for country in countries:
        if country != HOME_COUNTRY:
            newCountries.append(country)
    return newCountries
def locFromText(set_Country, textList, filterList):
    """
    Extract locations from tweets.
    :param set_Country: a country filter (one location name can be associated with multiple countries)
    :param textList: a list of all tweet texts selected from the database
    :param filterList: names that should not be treated as locations for a given event
    :return: a filtered list of locations extracted from the tweet texts
    """
    loc = []
    print('Start extracting locations from texts')
    for t in textList:
        text = t[1]
        if len(text) > 0:
            text = re.sub(r'[^\w]', ' ', text)  # strip punctuation and symbols
            places = geograpy.get_place_context(text=text)
            for add in places.address_strings:
                # address_strings are formatted "city, region, country"
                country = add.split(',')[2]
                if set_Country in country and not any(e in add for e in filterList):
                    loc.append((t[0], add))
    return loc
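# A sketch of how locFromText might be called. The rows and the filter list
# are invented sample data: each row is assumed to be (tweet_id, tweet_text),
# matching how the function indexes t[0] and t[1].
rows = [
    (1, "Flooding reported near Houston, Texas this morning"),
    (2, "Thoughts and prayers, no location here"),
]
hits = locFromText('United States', rows, filterList=['Houston Street'])
print(hits)  # e.g. [(1, 'Houston, Texas, United States')]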
def parseCities(self, title):
    cities = geograpy.get_place_context(text=title).cities
    newCities = []
    for city in cities:
        if city != HOME_CITY:
            newCities.append(city)
    return newCities
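# A quick sketch of the two filters above in use. HOME_COUNTRY and HOME_CITY
# are assumed module-level constants; since neither method touches self, they
# can be exercised as plain functions here with None for self.
HOME_COUNTRY = 'United States'
HOME_CITY = 'New York'

print(parseCountries(None, "Earthquake strikes Japan and Chile"))  # e.g. ['Japan', 'Chile']
print(parseCities(None, "Mayor of New York visits Tokyo"))         # e.g. ['Tokyo']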
def get_places(self):
    """Match geograpy's extracted places against the CITIES dataset.

    (Why do I need to open the dataset every time I parse this?)
    """
    text_input = self.text
    location_dict = {}
    places = geograpy.get_place_context(text=text_input)
    for city_dict in CITIES:
        # Match cities, states and countries against their respective lists
        if city_dict['city'] in places.cities:
            location_dict[city_dict['city']] = {
                'country': city_dict['country'],
                'state': city_dict['state'],
                'city': city_dict['city'],
                'name': city_dict['city'],
            }
        if city_dict['state'] in places.regions:
            location_dict[city_dict['state']] = {
                'state': city_dict['state'],
                'country': city_dict['country'],
                'name': city_dict['state'],
            }
        if city_dict['country'] in places.countries:
            location_dict[city_dict['country']] = {
                'country': city_dict['country'],
                'name': city_dict['country'],
            }
    return list(location_dict.values())
def getGeoLocation(self, newsUrl):
    '''
    DESCRIPTION:
    ------------
    Finds the country mentioned in newsUrl and its geo location.

    PARAMETERS:
    -----------
    newsUrl: URL corresponding to the news article.

    RETURNS:
    --------
    1. geoPoint: [longitude, latitude] of the country mentioned in newsUrl.
    2. country : country mentioned in newsUrl.
    '''
    # Set the geo_point
    places = geograpy.get_place_context(url=newsUrl)
    geoPoint = []
    country = ""
    try:
        # Geocode only the first (most-mentioned) country
        for mention in places.country_mentions:
            country = mention[0].encode('ascii', 'ignore')
            geolocator = Nominatim()
            location = geolocator.geocode(country)
            geoPoint.append(location.longitude)
            geoPoint.append(location.latitude)
            break
    except Exception:
        geoPoint = []
        country = ""
    return (geoPoint, country)
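# Recent geopy releases require a user_agent when constructing Nominatim (the
# bare call above only works on older versions). A sketch of the same lookup
# with an explicit agent; the agent string is just a placeholder.
from geopy.geocoders import Nominatim

geolocator = Nominatim(user_agent="news-geo-demo")
location = geolocator.geocode("France")
if location:
    print(location.longitude, location.latitude)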
def get_place(title, desc, url):
    if desc is not None:
        text = title + desc
    else:
        text = title
    places = get_place_context(text=text)
    if len(places.cities) > 0:
        return places.cities[0]
    elif len(places.countries) > 0:
        return places.countries[0]
    # Fall back to extracting places from the URL itself
    places = get_place_context(url=url)
    if len(places.cities) > 0:
        return places.cities[0]
    elif len(places.countries) > 0:
        return places.countries[0]
    return 0
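# get_place prefers a city from title+description, then a country, then falls
# back to fetching the URL, and returns 0 when nothing matches. Illustrative
# call with made-up arguments:
place = get_place("Protests in Paris over pension reform", None,
                  "https://example.com/paris-protests")
print(place)  # e.g. 'Paris'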
def findLocations(corpus):
    # parse corpus with geograpy
    places = geo.get_place_context(text=corpus)
    # print the extracted locations
    print(places.countries)
    print(places.regions)
    print(places.cities)
    print(places.other)
def main():
    # How many pictures to download
    pic_count = 15
    if len(sys.argv) >= 2:
        pic_count = int(sys.argv[1])

    # Create picture directory
    picture_dir = os.path.join(os.getcwd(), 'pics')
    create_dir(picture_dir)

    # Create unknown location directory
    unknown_location_dir = os.path.join(picture_dir, 'unknown')
    create_dir(unknown_location_dir)

    listing = travel_subreddit.hot(limit=pic_count)
    if len(sys.argv) >= 3 and sys.argv[2] == "--top":
        listing = travel_subreddit.top(limit=pic_count)

    # TODO: Preprocess list (remove all non-image posts)
    for submission in listing:
        # TODO: handle imgur links
        # Only download jpg
        if submission.url.endswith('.jpg'):
            # Combine all top-level comments into one text to search for the country
            search_str = ""
            for comment in list(submission.comments):
                if hasattr(comment, 'body'):
                    search_str += comment.body

            places = geograpy.get_place_context(text=search_str)
            if places.countries:
                # Get the country with the highest mentions
                country = max(places.country_mentions, key=lambda item: item[1])[0]
                country_dir = os.path.join(picture_dir, country)
                create_dir(country_dir)
            else:
                country = "unknown"

            # Clean up title for the filename
            words = nltk.word_tokenize(submission.title)
            space_separated_title = ' '.join(words)
            underscored_title = space_separated_title.replace(' ', '_')
            title = re.sub(r'\W+', '', underscored_title) + '.jpg'
            filepath = os.path.join(picture_dir, country, title)
            save_image(filepath, submission.url)
def getPlaceET_fromText_NLTK(text):
    result = list()
    if not text:
        return filter(None, result)
    # Access all of the places found by the extractor
    places = geograpy.get_place_context(text=text)
    for place in (places.countries + places.other):
        c = getISO3166_1code(place)
        result.append(c)
    return filter(None, flatten(result))
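# getISO3166_1code is not shown above; a minimal sketch of what it might look
# like using the pycountry package (an assumption - the original may use its
# own lookup table). Returns None when the name cannot be resolved.
import pycountry

def getISO3166_1code(place_name):
    try:
        return pycountry.countries.lookup(place_name).alpha_2
    except LookupError:
        return None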
def getTweetLocation(tweet):
    '''
    Detects a tweet's country based on the tweet's information.
    If the tweet has 'place' declared, we extract 'country' from it.
    Otherwise, we check the user's location and use geograpy: if a country is
    declared we extract it, and if a city is declared we assign it to the
    country it belongs to.
    :param tweet: tweet
    :return: country name, or None
    '''
    place = tweet['place']
    # geo = tweet['geo']                  # no 'geo' found in tweets
    # coordinates = tweet['coordinates']  # no 'coordinates' found in tweets

    # If 'place' exists in the tweet, return its country
    if place:
        return tweet['place']['country']

    # If no 'place' exists, try to get location info from the user's profile
    user_loc = tweet['user']['location']
    # If the user's location is empty, return None
    if user_loc == '':
        return None

    # Try to find some 'standard' keywords referring to specific countries
    # (geograpy could not identify them)
    country = recognizeSpecificCountries(user_loc)
    if country:
        return country

    # Find the country using geograpy
    places = geograpy.get_place_context(text=user_loc)
    if not places.countries:
        return None

    # geograpy returns a list of all possible countries; we take the first one.
    # If the input text contains both a city and a country, the first element
    # of the 'countries' list is always the country contained in the text,
    # e.g. Input: 'London'
    #      Output: countries=['United Kingdom', 'United States', 'Canada']
    #      Input: 'London, Canada'
    #      Output: countries=['Canada', 'Spain', 'United Kingdom', 'United States']
    return places.countries[0]
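# A sketch of getTweetLocation on a minimal tweet-shaped dict (only the keys
# the function reads are included; real Twitter payloads carry many more).
# recognizeSpecificCountries is assumed to be defined alongside the function.
tweet = {
    'place': None,
    'user': {'location': 'London, Canada'},
}
print(getTweetLocation(tweet))  # 'Canada'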
def map():
    text = request.form['text']
    places = geograpy.get_place_context(text=text)
    places = places.regions
    geolocator = Nominatim(user_agent=app.config['GOOGLE_MAP_API'])
    lat_lon = []
    for place in places:
        try:
            location = geolocator.geocode(place)
            if location:
                lat_lon.append([location.latitude, location.longitude])
        except GeocoderTimedOut:
            continue
    # something = request.form['map']
    return render_template('map.html', lat_lon=lat_lon)
def findLocFromURL(urlList):
    """
    Extract location info directly from a URL.
    :param urlList: list of filtered URLs
    :return: location names
    """
    print('Start extracting locations from URLs')
    findLoc = []
    for url in urlList:
        print(url[0])
        places = geograpy.get_place_context(url=url[1])
        addr = places.address_strings
        print(addr)
        if len(addr) > 0:
            findLoc.append((url[0], addr))
    return findLoc
def testProceedingsExample(self):
    '''
    test a proceedings title example
    '''
    examples = [
        '''Proceedings of the IEEE 14th International Conference on Semantic Computing, ICSC 2020, San Diego, CA, USA, February 3-5, 2020'''
    ]
    for example in examples:
        places = geograpy.get_place_context(text=example)
        print(places)
        city = geograpy.locateCity(example, debug=False)
        print(city)
def name_reg(text):
    """Return a triplet of recognized entities."""
    countries = Counter()
    regions = Counter()
    cities = Counter()
    if text:
        places = geograpy.get_place_context(text=text)
        if places.country_mentions:
            countries.update(unpack_fd(places.country_mentions))
        if places.region_mentions:
            regions.update(unpack_fd(places.region_mentions))
        if places.city_mentions:
            cities.update(unpack_fd(places.city_mentions))
    return (countries, regions, cities)
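# unpack_fd is not shown above. geograpy's *_mentions attributes hold
# (name, count) pairs, so a plausible minimal version (an assumption, not the
# original implementation) turns them into a mapping that Counter.update accepts:
from collections import Counter

def unpack_fd(mentions):
    return {name: count for name, count in mentions}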
def query_crawled_index(request, core_name, indexed_path):
    '''
    Query crawled data that has been indexed into Solr or Elasticsearch
    and return location names.
    '''
    if "solr" in indexed_path.lower():
        if IndexFile(core_name, indexed_path.lower()):
            location_names = []
            points = []
            query_range = 500
            try:
                url = "{0}/select?q=*%3A*&wt=json&rows=1".format(indexed_path)
                response = urllib2.urlopen(url)
                # Parse the JSON response body
                numFound = json.loads(response.read())['response']['numFound']
                for row in range(0, int(numFound), query_range):
                    query_url = "{0}/select?q=*%3A*&start={1}&rows={2}&wt=json".format(
                        indexed_path, row, row + query_range)
                    places = geograpy.get_place_context(url=query_url)
                    location_names.append(places.regions)
                    location_names.append(places.countries)
                    location_names.append(places.cities)
                    location_names.append(places.other)
                location_names = flatten(location_names)
                print("Found {0} locations for {1}".format(len(location_names), indexed_path))
                print("Finding coordinates...")
                for location in location_names:
                    try:
                        geolocation = geolocator.geocode(location)
                        points.append({
                            'loc_name': "{0}".format(location),
                            'position': {
                                'x': geolocation.longitude,
                                'y': geolocation.latitude
                            }
                        })
                    except Exception:
                        pass
                print("Found {0} coordinates...".format(len(points)))
                status = IndexCrawledPoints(core_name, indexed_path.lower(), points)
                return HttpResponse(status=200, content=status)
            except Exception:
                return False
def run(data):
    if 'coords' in data['geo']:
        return data
    if 'fromURL' in data and data['source'] in ['gdelt']:
        kwargs = {'url': data['fromURL']}
    else:
        if 'contentEnglish' in data:
            field = 'contentEnglish'
        else:
            field = 'content'
        kwargs = {'text': data[field]}
    try:
        pc = geograpy.get_place_context(**kwargs)
    except Exception:
        return data
def extract_venue(title):
    places = geograpy.get_place_context(text=title).cities
    if places:
        return ','.join(places)
    else:
        return None
def read_url(url_date):
    # Accepting the first parameter as a tuple because Pool() does not pass
    # multiple arguments to functions
    url, date = url_date

    # Connect to the MySQL database on AWS
    db = MySQLDatabase('gdelt', user='******', passwd='***********')

    # Model class: database table named crawler
    # Field instances: the columns
    class Crawler(peewee.Model):
        Country = peewee.CharField()
        Title = peewee.TextField()
        Websiteurl = peewee.TextField()
        Date = peewee.DateField()
        Keyword1 = peewee.CharField()
        Keyword2 = peewee.CharField()
        Keyword3 = peewee.CharField()

        class Meta:
            database = db

    db.connect()
    # Runs the SQL CREATE TABLE statement (only has to run once);
    # peewee first checks whether the table already exists
    db.create_tables([Crawler], True)

    # Identify the associated country
    mentioned_country = str('NA')
    try:
        places = geograpy.get_place_context(url=url)
        mentioned_country = places.countries[0].encode('utf-8')
    except Exception:
        pass

    # Identify the title and associated keywords
    mentioned_title = str('NA')
    mentioned_keyword1 = str('NA')
    mentioned_keyword2 = str('NA')
    mentioned_keyword3 = str('NA')
    # try:
    #     article = Article(url)
    #     article.download()
    #     article.parse()
    # except Exception:
    #     pass
    # try:
    #     mentioned_title = article.title
    # except Exception:
    #     pass
    # try:
    #     article.nlp()
    #     keywords = article.keywords
    # except Exception:
    #     pass
    # try:
    #     mentioned_keyword1 = keywords[0]
    # except Exception:
    #     pass
    # try:
    #     mentioned_keyword2 = keywords[1]
    # except Exception:
    #     pass
    # try:
    #     mentioned_keyword3 = keywords[2]
    # except Exception:
    #     pass

    # Insert into SQL
    if mentioned_country != 'NA':
        Crawler.create(Country=mentioned_country, Title=mentioned_title,
                       Websiteurl=url, Date=date, Keyword1=mentioned_keyword1,
                       Keyword2=mentioned_keyword2, Keyword3=mentioned_keyword3)
    db.close()
    return
def getGeoGraphy(text):
    places = geograpy.get_place_context(text=text)
    return places.countries, places.regions, places.cities
def run(self):
    context = geograpy.get_place_context(text=self.text)
    self.places = context.places
    self.people = context.people
    self.organs = context.organs
nltk.downloader.download('words')
nltk.downloader.download('treebank')
nltk.downloader.download('maxent_treebank_pos_tagger')
nltk.downloader.download('punkt')
# nltk.download('punkt')
# nltk.download('averaged_perceptron_tagger')
# nltk.download('maxent_ne_chunker')
# nltk.download('words')

# sentence = "I am from Kadawatha"
# places = GeoText(sentence)
# print(places.cities)

text = "Kadawatha Opposition Leader Mahinda Rajapaksa says that the whole public administration " \
       "has collapsed due to the constitution council’s arbitrary actions. " \
       "The Opposition Leader said so in response to a query a journalist raised " \
       "after a meeting held in Malabe and Meegamuwa"
places = geograpy.get_place_context(text=text)
print(places.places)

# url = 'http://www.bbc.com/news/world-europe-26919928'
# places = geograpy.get_place_context(url=url)
# print(places.cities)

print("****************************************************")

# text6 = u"""Opposition Leader Mahinda Rajapaksa says that the whole public administration
# has collapsed due to the constitution council’s arbitrary actions.
# The Opposition Leader said so in response to a query a journalist raised after a meeting held.."""
# e6 = Extractor(text=text6)
# e6.find_entities()
# print(e6.places)
# print("****************************************************")
            division = 'Minimumweight'
            divisionquery = 'Update BoxerData set Division = {2}{0}{2} where BoxerId = {1}'.format(
                division, boxerId, singleQuote)
            updateCursor.execute(divisionquery)
            updateCursor.commit()
        except:
            print('no division')

        # Write nationality
        try:
            country = birthDate[1]  # .encode(encoding='utf_16', errors='strict')
            country = str(country)
            country = country.lstrip()
            print(country)
            places = geograpy.get_place_context(text=country)
            nationalityquery = 'Update BoxerData set Nationality = {2}{0}{2} where BoxerId = {1}'.format(
                places.countries[1], boxerId, singleQuote)
            updateCursor.execute(nationalityquery)
            updateCursor.commit()
        except:
            print('no nationality')

        # Write gender
        # Write stance
        try:
            stancequery = 'Update BoxerData set Stance = {2}{0}{2} where BoxerId = {1}'.format(
                stance, boxerId, singleQuote)
            updateCursor.execute(stancequery)
            updateCursor.commit()
        except:
import csv
import geograpy

h_file = open('hurricanes.csv')
o_file = open('computed_areas_geograpy.csv', 'w', newline='')
hurricanes = csv.reader(h_file)

# Create writer object
wr = csv.writer(o_file, dialect='excel')

for row in hurricanes:
    h_uri = str(row[0])
    h_abstract = str(row[1])
    places = geograpy.get_place_context(text=h_abstract)
    countries = list(set(places.countries))
    regions = list(set(places.regions))
    # print('Row #' + str(hurricanes.line_num) + ' ' + str(places.countries) + ' ' + str(places.regions) + ' ' + str(places.cities) + ' ' + str(places.other))
    # print('Row #' + str(hurricanes.line_num) + ' ' + str(countries) + ' ' + str(regions))
    if len(countries) != 0:
        print('Countries: ')
        for country in countries:
            print(country)
            wr.writerow([h_uri, country])
    if len(regions) != 0:
        print('Regions: ')
for post in feedParsed.entries:
    feedTitle.append(post.title)
    feedContent.append(post.summary)
    print("feed " + str(entityCount) + " : " + post.title)
    entityCount = entityCount + 1

# places = geograpy.get_place_context(text=feedList[1])
# placesInFeed.append(places.places)
# print("places - %s" % [str(x) for x in placesInFeed])

print("Processing....")
for content in feedContent:
    if content != "":
        place = geograpy.get_place_context(text=content)
        placesInFeed.append(place.places)
    else:
        placesInFeed.append("null")

k = 1
for place in placesInFeed:
    print("place " + str(k) + " - %s" % str(place))
    k = k + 1

print("############################################################################################################")
print("############################################################################################################")
project_long_description = brazil_df['project_long_description'].values.tolist()
project_all = zip(project_title, project_long_description)

# for cell_title in brazil_df['project_title'] and cell_long in brazil_df['project_long_description']:
# for cell in brazil_df['project_title'], brazil_df['project_long_description']:
for cell in project_all:
    # print(cell_title)
    # print(cell_long)
    # cell = cell_title + cell_long
    # cell = ", ".join(cell)
    # print(cell)
    try:
        if not pd.isnull(cell[0]):
            placesInCell1 = geograpy.get_place_context(text=cell[0]).countries
        else:
            placesInCell1 = []
        if not pd.isnull(cell[1]):
            placesInCell2 = geograpy.get_place_context(text=cell[1]).countries
        else:
            placesInCell2 = []
        placesInCell = placesInCell1 + placesInCell2
        if placesInCell:
            if "United States" in placesInCell:
                if not pd.isnull(cell[0]):
                    if ((" US " not in cell[0]) and (" USA " not in cell[0]) and
                            (" United States Of America " not in cell[0]) and
                            ("United States" not in cell[0])):
                        if not pd.isnull(cell[1]):
    tweet = re.sub(r'[' + string.punctuation + ']+', ' ', tweet)  # remove punctuation
    twtok = TweetTokenizer(strip_handles=True, reduce_len=True)
    tokens = twtok.tokenize(tweet)
    tokens = [i.lower() for i in tokens
              if i not in stopwords and len(i) > 2 and i in english_vocab]
    return tokens

words = []
places = []
# for tw in tweets_text:
for tw in tweets_text[:1000]:
    words += process_tweets_texts(tw)
    places.append(geo.get_place_context(text=tw))
for tw in tweets_text[1000:2000]:
    words += process_tweets_texts(tw)
    places.append(geo.get_place_context(text=tw))

city = []
for p in range(len(places)):
    pl = places[p].cities
    for i in pl:
        if i in cities:
            city.append(i)
print(city)
def extract_city(text):
    city = geograpy.get_place_context(text=text)
    real_city = city.cities
    return real_city
api_url = 'http://92.62.139.201:8080/api/geonames/countries'

print("Opening input file '%s'..." % input_file_path)
with open(input_file_path, 'r', encoding='utf-8') as file:
    text = file.read().replace('\n', ' ')

# Break the text into 50k-character chunks
chunk_size = 50000
chunks = [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]

output = {}
results = []
print("Searching for geo names in '%s' language..." % language_code)
for chunk in chunks:
    places = get_place_context(text=chunk).countries
    temp_results = {}
    for place in places:
        if place[0].isupper() and place not in results and place in chunk:
            geo_names = get(api_url, {
                'name': place,
                'isolanguage': language_code
            }).json()
            if len(geo_names) > 0:
                pos = chunk.find(place)
                output[pos] = geo_names
                temp_results[pos] = place
                results.append(place)
def extract_country(affiliation):
    places = geograpy.get_place_context(text=affiliation)
    try:
        return places.country_mentions[0][0]
    except IndexError:
        return ""
def cityDic(places):
    geolocator = Nominatim(user_agent="specify_your_app_name_here")
    geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1)
    place_dicts = []
    for place in places:
        place_dict = {"text": place, "address": "", "latitude": "", "longtitude": ""}
        location = geocode(place)
        if location:
            place_dict["address"] = location.address
            point = tuple(location.point)
            place_dict["latitude"] = point[0]
            place_dict["longtitude"] = point[1]
        place_dicts.append(place_dict)
    return place_dicts

if __name__ == '__main__':
    args = get_args()
    data = load_from_json(args.data)
    place_tags = []
    # TODO: Process only sentences with label 1
    for sentence in data["sentences"]:
        places = geograpy.get_place_context(text=sentence)
        place_dicts = cityDic(places.cities)  # Only cities ???
        place_tags.append(place_dicts)
    data["place_tags"] = place_tags
    write_to_json(data, data["id"], extension="json", out_dir=args.out_dir)
def extract_location_from_text(text):
    # https://stackoverflow.com/questions/40517720/python-geograpy-unable-to-run-demo
    places = geograpy.get_place_context(text=text)
    return places.country_mentions
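# country_mentions is a list of (country, count) pairs ordered by frequency,
# so callers can rank countries directly. Illustrative call:
mentions = extract_location_from_text(
    "Berlin and Munich are in Germany; Paris is in France. Germany won."
)
print(mentions)  # e.g. [('Germany', 2), ('France', 1)]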
import folium
from folium.plugins import Fullscreen

m = folium.Map(location=[10, 0], zoom_start=2.1)
Fullscreen().add_to(m)
m.save(outfile='fullscreen.html')

import geograpy
url = 'http://www.bbc.com/news/world-europe-26919928'
places = geograpy.get_place_context(url=url)

folium.Marker([45.3288, -121.6625], popup='Mt. Hood Meadows').add_to(m)
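# The snippet above extracts places but only drops a hardcoded marker. A
# sketch of plotting the geograpy results themselves, geocoding each city with
# geopy (the user_agent string is a placeholder):
from geopy.geocoders import Nominatim

geolocator = Nominatim(user_agent="geograpy-folium-demo")
for city in places.cities:
    loc = geolocator.geocode(city)
    if loc:
        folium.Marker([loc.latitude, loc.longitude], popup=city).add_to(m)
m.save(outfile='places.html')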
import geograpy
from geograpy import extraction
from geopy.geocoders import Nominatim
from geopy.exc import GeocoderTimedOut

text = (
    "The National Air and Space Museum of the Smithsonian Institution, also called "
    "the Air and Space Museum, is a museum in Washington, D.C. It was established "
    "in 1946 as the National Air Museum and opened its main building on the National "
    "Mall near L'Enfant Plaza in 1976. In 2018, the museum saw approximately 6.2 "
    "million visitors, making it the fifth most visited museum in the world, and the "
    "second most visited museum in the United States. The museum contains the Apollo 11 "
    "Command Module Columbia, the Friendship 7 capsule which was flown by John Glenn, "
    "Charles Lindbergh's Spirit of St. Louis, the Bell X-1 which broke the sound "
    "barrier, the model of the starship Enterprise used in the science fiction "
    "television show Star Trek: The Original Series, and the Wright brothers' "
    "Wright Flyer airplane near the entrance."
)
places = geograpy.get_place_context(text=text)
places = places.regions

geolocator = Nominatim(user_agent='google_api')
lat_lon = []
for place in places:
    try:
        location = geolocator.geocode(place)
        if location:
            lat_lon.append([location.latitude, location.longitude])
    except GeocoderTimedOut:
        continue
print(places)