Ejemplo n.º 1
0
class BasicCliffTest(unittest.TestCase):
    """Basic checks that the API client can pull results from the server.

    The server address is read from the ``CLIFF_URL`` environment variable,
    so every test here talks to whatever server that variable names.
    """

    def setUp(self):
        """Build a fresh client for the server named by ``CLIFF_URL``."""
        self._url = os.getenv("CLIFF_URL")
        self._cliff = Cliff(self._url)

    def test_parse_text(self):
        """Parse one sentence and check the entity counts and the place id."""
        results = self._cliff.parse_text(
            "This is about Einstien at the IIT in New Delhi.")
        results = results['results']
        self.assertEqual(len(results['organizations']), 1)
        place_mentions = results['places']['mentions']
        self.assertEqual(len(place_mentions), 1)
        # presumably the GeoNames id of New Delhi -- confirm against the server
        self.assertEqual(place_mentions[0]['id'], 1261481)
        self.assertEqual(len(results['people']), 1)

    def test_extract_content(self):
        """Extract an article by URL; expect the URL echoed and real text."""
        test_url = "https://www.foxnews.com/us/temple-university-stands-by-marc-lamont-hill-after-cnn-fires-him-for-anti-israel-remarks"
        results = self._cliff.extract_content(test_url)
        results = results['results']
        self.assertEqual(test_url, results['url'])
        # assertGreater reports the actual length when the check fails.
        self.assertGreater(len(results['text']), 100)

    def test_geonames_lookup(self):
        """Look up one GeoNames id and verify its fields and parent chain."""
        results = self._cliff.geonames_lookup(4943351)
        self.assertEqual(results['id'], 4943351)
        # Coordinates are floats: compare to five decimal places (the
        # precision of the expected values) instead of bit-for-bit equality.
        self.assertAlmostEqual(results['lon'], -71.09172, places=5)
        self.assertAlmostEqual(results['lat'], 42.35954, places=5)
        self.assertEqual(results['name'],
                         "Massachusetts Institute of Technology")
        # Walk up the nested 'parent' entries, nearest ancestor first.
        expected_ancestors = (
            "City of Cambridge",
            "Middlesex County",
            "Massachusetts",
            "United States",
        )
        node = results
        for expected_name in expected_ancestors:
            node = node['parent']
            self.assertEqual(node['name'], expected_name)

    def test_local_replacements(self):
        """Check that ``text_replacements`` changes which place is resolved."""
        # NOTE(review): presumably the client substitutes the key for the
        # value in the text before parsing -- confirm against Cliff itself.
        replacements = {
            'Londonderry': 'London',
        }
        # make sure non-replaced fetches the city in the UK
        results = self._cliff.parse_text("This is about London.")['results']
        mention = results['places']['mentions'][0]
        self.assertEqual(GEONAME_LONDON_UK, mention['id'])
        # now see if it gets the city with replacements
        replacing_cliff = Cliff(self._url, text_replacements=replacements)
        results = replacing_cliff.parse_text(
            "This is about London.")['results']
        replaced_mention = results['places']['mentions'][0]
        self.assertEqual(GEONAME_LONDERRY_NH, replaced_mention['id'])
Ejemplo n.º 2
0
 def test_local_replacements(self):
     """Compare place resolution with and without ``text_replacements``.

     The same sentence is parsed twice: once with the shared client and
     once with a client built with a replacements mapping.
     """
     sentence = "This is about London."
     swap_table = {'Londonderry': 'London'}

     # Baseline: the shared client should resolve to the UK city.
     baseline = self._cliff.parse_text(sentence)['results']
     baseline_mention = baseline['places']['mentions'][0]
     self.assertEqual(GEONAME_LONDON_UK, baseline_mention['id'])

     # A client carrying the replacements should resolve differently.
     swapping_client = Cliff(self._url, text_replacements=swap_table)
     swapped = swapping_client.parse_text(sentence)['results']
     swapped_mention = swapped['places']['mentions'][0]
     self.assertEqual(GEONAME_LONDERRY_NH, swapped_mention['id'])
Ejemplo n.º 3
0
    def clavin(self):
        """Parse ``self.body_page`` with Cliff and store the result in ``self.d``.

        The text is sent to the server at ``http://localhost:8080``. The
        response is written to ``clavin.json`` in the current working
        directory and then loaded back from that file into ``self.d``.

        If the request fails, the failure is printed and logged, and an
        empty dict is written and loaded instead. An empty result is also
        logged as an error.
        """
        my_cliff = Cliff('http://localhost:8080')
        try:
            dictionary = my_cliff.parse_text(self.body_page)
        except Exception:
            # Best effort: carry on with an empty result. Catching Exception
            # rather than using a bare except lets KeyboardInterrupt and
            # SystemExit propagate, and logging.exception keeps the traceback.
            print("Clavin Docker not running or link not valid", '\n')
            logging.exception("Clavin Docker not running or link not valid")
            dictionary = {}

        # Serialize before opening the file so that a serialization error
        # cannot leave a truncated clavin.json behind.
        json_object = json.dumps(dictionary, indent=4)

        with open("clavin.json", "w") as outfile:
            outfile.write(json_object)
            logging.info("Clavin JSON file written")

        with open('clavin.json') as fi:
            self.d = json.load(fi)
            if not self.d:
                logging.error("Clavin JSON File Empty")
Ejemplo n.º 4
0
# Cliff client constructed with the address http://localhost:8080.
my_cliff = Cliff('http://localhost:8080')

file_name = "../processedData/messages.xlsx"  # path to file + file name
sheet = "Sheet1"  # sheet name or sheet number or list of sheet numbers and names

# Load the sheet; the loop below reads each row's 'message' and 'author'.
df = pd.read_excel(io=file_name, sheet_name=sheet)
excel_data = []
# Stripped fragments that were already sent for parsing (used to skip repeats).
check_repeat = []
for index, row in df.iterrows():
    # Break the message into fragments at every '?', '.' or ':'.
    parsed_row = re.split('[?.:]', row['message'])
    for sentence in parsed_row:
        # Keep only short fragments: fewer than 4 whitespace-separated words
        # and more than 2 characters once surrounding whitespace is removed.
        if (len(sentence.split()) < 4 and len(sentence.strip()) > 2):
            # Parse each distinct (stripped) fragment only once.
            if (sentence.strip() not in check_repeat):
                temp_data = {}
                check_repeat.append(sentence.strip())
                result = my_cliff.parse_text(sentence)
                try:
                    # 'focus' is read as a dict with 'cities' and 'states'
                    # lists; an empty dict means there is nothing to record.
                    targets = result['results']['places']['focus']
                    if targets != {}:
                        # message, author
                        temp_data['author'] = row['author']
                        temp_data['message'] = sentence.strip()
                        # city data
                        temp_data['cities'] = []
                        if targets['cities'] != []:
                            # Store each city as a (name, lat, lon) tuple.
                            for city in targets['cities']:
                                temp_data['cities'].append(
                                    (city['name'], city['lat'], city['lon']))
                        #state data
                        temp_data['states'] = []
                        if targets['states'] != []:
Ejemplo n.º 5
0
def extract_locaiton_info(text):
    """Print the parse result for *text*, then a fixed GeoNames lookup.

    A client is built from ``cliff_server_addr``. Both responses are written
    to stdout and nothing is returned. The lookup always uses the
    hard-coded id 4943351.
    """
    client = Cliff(cliff_server_addr)

    parsed = client.parse_text(text)
    print(parsed)

    looked_up = client.geonames_lookup(4943351)
    print(looked_up)
Ejemplo n.º 6
0
# result object to append to
result = []

# index for abstract object
abstract = scraped_abstracts['abstract']

# index for title object
EID = scraped_abstracts['EID']

# loop through abstracts
for i in range(0, len(abstract)):

    try:

        # run cliff on text at localhost
        this = my_cliff.parse_text(abstract.iloc[i])

        # extract for required part of json for 'mentions'
        this_2 = this['results']
        this_3 = this_2['places']
        this_4 = this_3['mentions']

        # convert json to dataframe
        df = json_normalize(this_4)

        # extract for required part of json for 'focus'
        this_5 = this_3['focus']
        this_6 = this_5['countries']

        # convert focus to dataframe
        this_7 = json_normalize(this_6)