class BasicCliffTest(unittest.TestCase):
    """Integration tests that exercise a live CLIFF server.

    The server address is read from the CLIFF_URL environment variable in
    ``setUp``; every test performs a real HTTP round-trip, so these are
    integration (not unit) tests.
    """

    def setUp(self):
        # Pull the server location from the environment so CI / local runs
        # can point at any CLIFF instance without code changes.
        self._url = os.getenv("CLIFF_URL")
        self._cliff = Cliff(self._url)

    def test_parse_text(self):
        # NOTE: "Einstien" is misspelled in the fixture text — kept verbatim,
        # since changing the input string would change what the server extracts.
        results = self._cliff.parse_text(
            "This is about Einstien at the IIT in New Delhi.")
        results = results['results']
        # Expect exactly one organization (IIT), one place mention (New Delhi),
        # and one person from the sentence.
        self.assertEqual(len(results['organizations']), 1)
        self.assertEqual(len(results['places']['mentions']), 1)
        # 1261481 is the geonames id for New Delhi.
        self.assertEqual(results['places']['mentions'][0]['id'], 1261481)
        self.assertEqual(len(results['people']), 1)

    def test_extract_content(self):
        # Verify content extraction echoes the requested URL and returns a
        # non-trivial amount of article text.
        test_url = "https://www.foxnews.com/us/temple-university-stands-by-marc-lamont-hill-after-cnn-fires-him-for-anti-israel-remarks"
        results = self._cliff.extract_content(test_url)
        results = results['results']
        self.assertEqual(test_url, results['url'])
        self.assertTrue(len(results['text']) > 100)

    def test_geonames_lookup(self):
        # 4943351 is the geonames id for MIT; walk the parent chain up to the
        # country to confirm the full administrative hierarchy is returned.
        results = self._cliff.geonames_lookup(4943351)
        self.assertEqual(results['id'], 4943351)
        self.assertEqual(results['lon'], -71.09172)
        self.assertEqual(results['lat'], 42.35954)
        self.assertEqual(results['name'], "Massachusetts Institute of Technology")
        self.assertEqual(results['parent']['name'], "City of Cambridge")
        self.assertEqual(results['parent']['parent']['name'], "Middlesex County")
        self.assertEqual(results['parent']['parent']['parent']['name'],
                         "Massachusetts")
        self.assertEqual(
            results['parent']['parent']['parent']['parent']['name'],
            "United States")

    def test_local_replacements(self):
        replacements = {
            'Londonderry': 'London',
        }
        # Without replacements, "London" should resolve to the city in the UK.
        results = self._cliff.parse_text("This is about London.")['results']
        mention = results['places']['mentions'][0]
        self.assertEqual(GEONAME_LONDON_UK, mention['id'])
        # With the replacement table installed, the same text should resolve
        # to Londonderry, NH instead.
        replacing_cliff = Cliff(self._url, text_replacements=replacements)
        results = replacing_cliff.parse_text(
            "This is about London.")['results']
        replaced_mention = results['places']['mentions'][0]
        self.assertEqual(GEONAME_LONDERRY_NH, replaced_mention['id'])
def test_local_replacements(self):
    """Check that a text-replacement table changes which geoname is matched."""
    substitution_table = {
        'Londonderry': 'London',
    }
    # Baseline: a plain client should resolve "London" to the UK capital.
    baseline = self._cliff.parse_text("This is about London.")['results']
    uk_mention = baseline['places']['mentions'][0]
    self.assertEqual(GEONAME_LONDON_UK, uk_mention['id'])
    # A client configured with the substitution table should instead land
    # on Londonderry, New Hampshire for the identical input text.
    replacing_client = Cliff(self._url, text_replacements=substitution_table)
    replaced = replacing_client.parse_text(
        "This is about London.")['results']
    nh_mention = replaced['places']['mentions'][0]
    self.assertEqual(GEONAME_LONDERRY_NH, nh_mention['id'])
def clavin(self):
    """Parse self.body_page with a local CLIFF/CLAVIN server and load the result.

    Writes the raw response to ``clavin.json``, then reads it back into
    ``self.d``.  If the server is unreachable, an empty dict is written and
    an error is logged.
    """
    my_cliff = Cliff('http://localhost:8080')
    dictionary = {}
    # Original code wrapped this in a `while True` that broke on both the
    # success and failure paths — a single try/except is equivalent.
    try:
        dictionary = my_cliff.parse_text(self.body_page)
    except Exception:  # narrowed from a bare `except:` (kept best-effort behavior)
        print("Clavin Docker not running or link not valid", '\n')
        logging.error("Clavin Docker not running or link not valid")
    json_object = json.dumps(dictionary, indent=4)
    with open("clavin.json", "w") as outfile:
        outfile.write(json_object)
    logging.info("Clavin JSON file written")
    # Round-trips through the file on disk (rather than using `dictionary`
    # directly) so self.d always reflects what was actually persisted.
    with open('clavin.json') as fi:
        # with open('sample.json') as fi:
        self.d = json.load(fi)
    if not self.d:
        logging.error("Clavin JSON File Empty")
my_cliff = Cliff('http://localhost:8080') file_name = "../processedData/messages.xlsx" # path to file + file name sheet = "Sheet1" # sheet name or sheet number or list of sheet numbers and names df = pd.read_excel(io=file_name, sheet_name=sheet) excel_data = [] check_repeat = [] for index, row in df.iterrows(): parsed_row = re.split('[?.:]', row['message']) for sentence in parsed_row: if (len(sentence.split()) < 4 and len(sentence.strip()) > 2): if (sentence.strip() not in check_repeat): temp_data = {} check_repeat.append(sentence.strip()) result = my_cliff.parse_text(sentence) try: targets = result['results']['places']['focus'] if targets != {}: # message, author temp_data['author'] = row['author'] temp_data['message'] = sentence.strip() # city data temp_data['cities'] = [] if targets['cities'] != []: for city in targets['cities']: temp_data['cities'].append( (city['name'], city['lat'], city['lon'])) #state data temp_data['states'] = [] if targets['states'] != []:
def extract_locaiton_info(text):
    """Print CLIFF parse results for *text*, then a sample geonames lookup.

    NOTE(review): the name contains a typo ("locaiton"); kept as-is so
    existing callers are not broken.
    """
    client = Cliff(cliff_server_addr)
    parsed = client.parse_text(text)
    print(parsed)
    # 4943351 is a fixed geonames id used as a lookup smoke-check.
    print(client.geonames_lookup(4943351))
# result object to append to result = [] # index for abstract object abstract = scraped_abstracts['abstract'] # index for title object EID = scraped_abstracts['EID'] # loop through abstracts for i in range(0, len(abstract)): try: # run cliff on text at localhost this = my_cliff.parse_text(abstract.iloc[i]) # extract for required part of json for 'mentions' this_2 = this['results'] this_3 = this_2['places'] this_4 = this_3['mentions'] # convert json to dataframe df = json_normalize(this_4) # extract for required part of json for 'focus' this_5 = this_3['focus'] this_6 = this_5['countries'] # convert focus to dataframe this_7 = json_normalize(this_6)