def testExtractorFromUrl(self):
    """Extract geo entities from the Wikipedia page on the Louvre."""
    louvre_url = 'https://en.wikipedia.org/wiki/Louvre'
    extractor = Extractor(url=louvre_url)
    extractor.find_geoEntities()
    self.check(extractor.places, ['Paris', 'France'])
def testStackOverflow54721435(self):
    """
    see https://stackoverflow.com/questions/54721435/unable-to-extract-city-names-from-a-text-using-geograpypython
    """
    sample = 'I live in Kadawatha a suburb of Colombo Sri Lanka'
    extractor = Extractor(text=sample)
    extractor.find_entities()
    print(extractor.places)
def testStackoverflow54077973(self):
    """
    see https://stackoverflow.com/questions/54077973/geograpy3-library-for-extracting-the-locations-in-the-text-gives-unicodedecodee
    """
    address = 'Jersey City New Jersey 07306'
    extractor = Extractor(text=address)
    extractor.find_entities()
    self.check(extractor.places, ['Jersey', 'City'])
def testExtractorFromUrl(self):
    """Extract geo entities from a BBC news article about Ukraine."""
    article_url = 'http://www.bbc.com/news/world-europe-26919928'
    extractor = Extractor(url=article_url)
    extractor.find_geoEntities()
    self.check(extractor.places, ['Russia', 'Kiev', 'Ukraine'])
def testStackoverflow43322567(self):
    """
    see https://stackoverflow.com/questions/43322567
    """
    wiki_url = 'https://en.wikipedia.org/wiki/U.S._state'
    extractor = Extractor(url=wiki_url)
    found = extractor.find_geoEntities()
    self.check(found, ['Alabama', 'Virginia', 'New York'])
    print(found)
def testStackoverflow54712198(self):
    """
    see https://stackoverflow.com/questions/54712198/not-only-extracting-places-from-a-text-but-also-other-names-in-geograpypython
    """
    sample = '''Opposition Leader Mahinda Rajapaksa says that the whole public administration has collapsed due to the constitution council’s arbitrary actions. The Opposition Leader said so in response to a query a journalised raised after a meeting held...'''
    extractor = Extractor(sample)
    found = extractor.find_geoEntities()
    if self.debug:
        print(found)
    # no geographic entity should be detected in this text
    self.assertEqual([], found)
def get_place_context(url=None, text=None):
    """Build a PlaceContext from the entities found in a url or a text.

    Args:
        url: optional url to read the text from
        text: optional text to analyze

    Returns:
        PlaceContext: context with countries, regions, cities and other
        places classified from the extracted entities
    """
    extractor = Extractor(url=url, text=text)
    extractor.find_entities()
    context = PlaceContext(extractor.places)
    context.set_countries()
    context.set_regions()
    context.set_cities()
    context.set_other()
    return context
def locateCity(location, correctMisspelling=False, debug=False):
    """Locate the city described by the given location string.

    Args:
        location(string): the description of the location
        correctMisspelling(boolean): if True try to correct misspellings
        debug(boolean): if True show debug information

    Returns:
        the city found by Locator.locateCity for the extracted places
        (None if no city could be located)
    """
    extractor = Extractor(text=location, debug=debug)
    extractor.split()
    locator = Locator.getInstance(correctMisspelling=correctMisspelling, debug=debug)
    return locator.locateCity(extractor.places)
def testStackoverflow55548116(self):
    """
    see https://stackoverflow.com/questions/55548116/geograpy3-library-is-not-working-properly-and-give-traceback-error
    """
    feedContent = ['Las Vegas is a city in Nevada']
    placesInFeed = []
    for content in feedContent:
        if content == "":
            continue  # skip empty feed entries
        extractor = Extractor(text=content)
        extractor.find_entities()
        found = extractor.places
        if self.debug:
            print(found)
        placesInFeed.append(found)
def get_place_context(url=None, text=None):
    """Build a PlaceContext (with people and organizations) from a url or text.

    Args:
        url: optional url to read the text from
        text: optional text to analyze

    Returns:
        PlaceContext: context with countries, regions, cities and other
        places classified from the extracted entities
    """
    extractor = Extractor(url=url, text=text)
    extractor.find_entities()
    context = PlaceContext(extractor.places, extractor.people, extractor.organs)
    context.set_countries()
    context.set_regions()
    context.set_cities()
    context.set_other()
    return context


# example usage:
# url = 'http://www.bbc.com/news/world-us-canada-39821789'
# places = get_place_context(url=url)
# len(places)
def get_place_context(url=None, text=None, labels=Labels.default, debug=False):
    """Get a place context for a given text.

    Provides information about country, region, city and other based on
    NLTK Named Entities in the label set Geographic(GPE), Person(PERSON)
    and Organization(ORGANIZATION).

    Args:
        url(String): the url to read text from (if any)
        text(String): the text to analyze
        labels: the set of entity labels to consider
        debug(boolean): if True show debug information

    Returns:
        PlaceContext: the place context
    """
    extractor = Extractor(url=url, text=text, debug=debug)
    extractor.find_entities(labels=labels)
    placeContext = PlaceContext(extractor.places)
    placeContext.setAll()
    return placeContext
def test():
    """Smoke-test the Extractor against a news url and three text snippets."""
    url_extractor = Extractor(url='http://www.bbc.com/news/world-europe-26919928')
    url_extractor.find_entities()
    assert len(url_extractor.places) > 0
    assert 'Russia' in url_extractor.places
    assert 'Kiev' in url_extractor.places

    tweet = """ Perfect just Perfect! It's a perfect storm for Nairobi on a Friday evening! horrible traffic here is your cue to become worse @Ma3Route """
    tweet_extractor = Extractor(text=tweet)
    tweet_extractor.find_entities()
    assert len(tweet_extractor.places) > 0
    assert 'Nairobi' in tweet_extractor.places

    cycling = """ Risks of Cycling in Nairobi:http://www.globalsiteplans.com/environmental-design/engineering-environmental-design/the-risky-affair-of-cycling-in-nairobi-kenya/ ... via @ConstantCap @KideroEvans @county_nairobi @NrbCity_Traffic """
    cycling_extractor = Extractor(text=cycling)
    cycling_extractor.find_entities()
    assert len(cycling_extractor.places) > 0
    assert 'Nairobi' in cycling_extractor.places

    traffic = """ @DurbanSharks [Africa Renewal]It is early morning in Nairobi, the Kenyan capital. The traffic jam along Ngong """
    traffic_extractor = Extractor(text=traffic)
    traffic_extractor.find_entities()
    assert len(traffic_extractor.places) > 0
    assert 'Nairobi' in traffic_extractor.places
    assert 'Ngong' in traffic_extractor.places
def test():
    """Smoke-test the Extractor on a url and several text snippets.

    BUG FIX: the original used Python 2 print statements
    (``print e5.places`` / ``print e6.places``) which are a SyntaxError
    under Python 3 — converted to print() function calls.
    """
    e = Extractor(url='http://www.bbc.com/news/world-europe-26919928')
    e.find_entities()
    assert len(e.places) > 0
    assert 'Russia' in e.places
    assert 'Kiev' in e.places

    text = """ Perfect just Perfect! It's a perfect storm for Nairobi on a Friday evening! horrible traffic here is your cue to become worse @Ma3Route """
    e2 = Extractor(text=text)
    e2.find_entities()
    assert len(e2.places) > 0
    assert 'Nairobi' in e2.places

    text3 = """ Risks of Cycling in Nairobi:http://www.globalsiteplans.com/environmental-design/engineering-environmental-design/the-risky-affair-of-cycling-in-nairobi-kenya/ ... via @ConstantCap @KideroEvans @county_nairobi @NrbCity_Traffic """
    e3 = Extractor(text=text3)
    e3.find_entities()
    assert len(e3.places) > 0
    assert 'Nairobi' in e3.places

    text4 = """ @DurbanSharks [Africa Renewal]It is early morning in Nairobi, the Kenyan capital. The traffic jam along Ngong """
    e4 = Extractor(text=text4)
    e4.find_entities()
    assert len(e4.places) > 0
    assert 'Nairobi' in e4.places
    assert 'Ngong' in e4.places

    # unicode
    text5 = u""" There is a city called New York in the United States."""
    e5 = Extractor(text=text5)
    e5.find_entities()
    print(e5.places)
    assert len(e5.places) == 2
    assert u'New York' in e5.places
    assert u'United States' in e5.places

    # unicode and two words
    text6 = u""" There is a city called São Paulo in Brazil."""
    e6 = Extractor(text=text6)
    e6.find_entities()
    print(e6.places)
    assert len(e6.places) > 1
    assert u'São Paulo' in e6.places
def collect_news():
    """Poll the configured RSS feeds, extract locations from new items,
    store them in Firestore, and re-schedule itself on a timer.

    BUG FIXES:
    - the final status print contained a string literal broken by a raw
      line break (a SyntaxError); it now uses an explicit ``\\n`` escape.
    - ``for url in url:`` rebound and shadowed the url list; the list is
      now named ``urls``.

    NOTE(review): relies on module-level globals (news_in_db, entityCount,
    db, newsObjects) and helpers (retrive_news_from_firebase,
    set_entityCount, is_news_already_exist_in_db, classify_news, News,
    update_news_dataset) defined elsewhere in this module — confirm they
    are in scope before use.
    """
    global news_in_db
    global entityCount
    global db

    print('Running collecting news')
    retrive_news_from_firebase()
    # check_news_dataset()
    set_entityCount()
    urls = [
        "http://www.adaderana.lk/rss.php",
        "http://www.hirunews.lk/rss/english.xml",
        "https://www.news.lk/news?format=feed",
        "https://srilankamirror.com/news?format=feed&type=rss",
        "http://www.thesundayleader.lk/feed/",
        "https://www.newsfirst.lk/feed/"
    ]
    for url in urls:
        # read the rss feed from the url
        feedParsed = feedparser.parse(url)
        # check whether the rss reading succeeded or not
        if feedParsed.feed != {}:
            for post in feedParsed.entries:
                if is_news_already_exist_in_db(post.title) != True:
                    category = classify_news(post.title)
                    newsObj = News(post.title, post.description, post.summary,
                                   post.link, category, post.published,
                                   entityCount)
                    newsObjects.append(newsObj)
                    # extract locations from the item description
                    locations = Extractor(text=post.description)
                    locations.find_entities()
                    # locations.places is an array of place names
                    newsObj.add_locations(locations.places)
                    doc_ref = db.collection(u'news').document()
                    doc_ref.set({
                        u'title': newsObj.title,
                        u'news_id': newsObj.news_id,
                        u'description': newsObj.description,
                        u'summary': newsObj.summary,
                        u'link': newsObj.link,
                        u'category': newsObj.category,
                        u'locations': newsObj.locations,
                        u'date_time': newsObj.date_time
                    })
                    update_news_dataset(newsObj)
                    print("feed " + str(newsObj.news_id) + " : " +
                          str(newsObj.title))
                    print('category: ', category, '. time ',
                          newsObj.date_time, ' . \nlocations:',
                          newsObj.locations)
                    entityCount = entityCount + 1
        else:
            print('Connection failed with url :', url)

    WAIT_SECONDS = 100  # timer for thread
    print(time.ctime())
    news_in_db.clear()
    # re-schedule this collector to run again after WAIT_SECONDS
    threading.Timer(WAIT_SECONDS, collect_news).start()
def testExtractorFromText(self):
    """Test different texts for getting geo context information."""
    tweet = """ Perfect just Perfect! It's a perfect storm for Nairobi on a Friday evening! horrible traffic here is your cue to become worse @Ma3Route """
    tweet_extractor = Extractor(text=tweet)
    tweet_extractor.find_entities()
    self.check(tweet_extractor.places, ['Nairobi'])

    cycling = """ Risks of Cycling in Nairobi:http://www.globalsiteplans.com/environmental-design/engineering-environmental-design/the-risky-affair-of-cycling-in-nairobi-kenya/ ... via @ConstantCap @KideroEvans @county_nairobi @NrbCity_Traffic """
    cycling_extractor = Extractor(text=cycling)
    cycling_extractor.find_entities()
    self.check(cycling_extractor.places, ['Nairobi'])

    traffic = """ @DurbanSharks [Africa Renewal]It is early morning in Nairobi, the Kenyan capital. The traffic jam along Ngong """
    traffic_extractor = Extractor(text=traffic)
    traffic_extractor.find_entities()
    self.check(traffic_extractor.places, ['Nairobi', 'Ngong'])

    # unicode
    ny_text = u""" There is a city called New York in the United States."""
    ny_extractor = Extractor(text=ny_text)
    ny_extractor.find_entities()
    self.check(ny_extractor.places, ['New York', 'United States'])

    # unicode and two words
    sp_text = u""" There is a city called São Paulo in Brazil."""
    sp_extractor = Extractor(text=sp_text)
    sp_extractor.find_entities()
    self.check(sp_extractor.places, ['São Paulo'])