Beispiel #1
0
class BasicCliffTest(unittest.TestCase):
    """A basic set of test cases to make sure the API can pull from the
    server correctly (requires a live CLIFF server at $CLIFF_URL)."""

    def setUp(self):
        self._url = os.getenv("CLIFF_URL")
        self._cliff = Cliff(self._url)

    def test_parse_text(self):
        response = self._cliff.parse_text(
            "This is about Einstien at the IIT in New Delhi.")
        body = response['results']
        print(body)
        self.assertEqual(len(body['organizations']), 1)
        mentions = body['places']['mentions']
        self.assertEqual(len(mentions), 1)
        self.assertEqual(mentions[0]['id'], 1261481)
        self.assertEqual(len(body['people']), 1)

    def test_extract_content(self):
        test_url = "https://www.foxnews.com/us/temple-university-stands-by-marc-lamont-hill-after-cnn-fires-him-for-anti-israel-remarks"
        body = self._cliff.extract_content(test_url)['results']
        self.assertEqual(test_url, body['url'])
        self.assertTrue(len(body['text']) > 100)

    def test_geonames_lookup(self):
        place = self._cliff.geonames_lookup(4943351)
        self.assertEqual(place['id'], 4943351)
        self.assertEqual(place['lon'], -71.09172)
        self.assertEqual(place['lat'], 42.35954)
        self.assertEqual(place['name'],
                         "Massachusetts Institute of Technology")
        # Walk up the admin hierarchy: city -> county -> state -> country.
        node = place
        for expected_name in ("City of Cambridge", "Middlesex County",
                              "Massachusetts", "United States"):
            node = node['parent']
            self.assertEqual(node['name'], expected_name)

    def test_local_replacements(self):
        swaps = {
            'Londonderry': 'London',
        }
        # Without replacements, "London" should resolve to the UK city.
        plain = self._cliff.parse_text("This is about London.")['results']
        self.assertEqual(GEONAME_LONDON_UK,
                         plain['places']['mentions'][0]['id'])
        # With replacements active, expect the New Hampshire city instead.
        swapping_client = Cliff(self._url, text_replacements=swaps)
        swapped = swapping_client.parse_text(
            "This is about London.")['results']
        self.assertEqual(GEONAME_LONDERRY_NH,
                         swapped['places']['mentions'][0]['id'])
Beispiel #2
0
 def test_local_replacements(self):
     """Text replacements passed to the client should change geocoding."""
     swaps = {'Londonderry': 'London'}
     # Without replacements, "London" resolves to the UK city.
     plain = self._cliff.parse_text("This is about London.")['results']
     self.assertEqual(GEONAME_LONDON_UK,
                      plain['places']['mentions'][0]['id'])
     # With replacements active, expect the New Hampshire city instead.
     swapping_client = Cliff(self._url, text_replacements=swaps)
     swapped = swapping_client.parse_text(
         "This is about London.")['results']
     self.assertEqual(GEONAME_LONDERRY_NH,
                      swapped['places']['mentions'][0]['id'])
Beispiel #3
0
    def clavin(self):
        """Geoparse ``self.body_page`` via a local CLIFF/CLAVIN server.

        The JSON response is written to ``clavin.json`` and re-loaded into
        ``self.d``. On any failure to reach the server an empty dict is
        written instead, so downstream consumers still find a file
        (best-effort behavior preserved from the original).
        """
        my_cliff = Cliff('http://localhost:8080')
        dictionary = {}
        # The original wrapped this in a `while True` loop that broke on both
        # paths (it ran exactly once) and used a bare `except:` that would
        # also swallow SystemExit/KeyboardInterrupt — both fixed here.
        try:
            dictionary = my_cliff.parse_text(self.body_page)
        except Exception:
            print("Clavin Docker not running or link not valid", '\n')
            logging.error("Clavin Docker not running or link not valid")

        json_object = json.dumps(dictionary, indent=4)

        with open("clavin.json", "w") as outfile:
            outfile.write(json_object)
            logging.info("Clavin JSON file written")

        with open('clavin.json') as fi:
            self.d = json.load(fi)
            if not self.d:
                logging.error("Clavin JSON File Empty")
Beispiel #4
0
    # Route ERROR-level records to Sentry (the enclosing `try:` starts
    # before this excerpt; ConfigException below means SENTRY_DSN is unset).
    handler = SentryHandler(config.get('SENTRY_DSN'))
    handler.setLevel(logging.ERROR)
    setup_logging(handler)
except ConfigException as e:
    logger.info("no sentry logging")

# Connect to MediaCloud
TOOL_API_KEY = config.get('MEDIA_CLOUD_API_KEY')

mc = mediacloud.api.AdminMediaCloud(TOOL_API_KEY)
logger.info("Connected to mediacloud")

# Connect to CLIFF if the settings are there
cliff = None
try:
    cliff = Cliff(config.get('CLIFF_URL'))
except KeyError as e:
    # NOTE(review): this catches KeyError while the Sentry block above
    # catches ConfigException for the same config.get() call — confirm
    # which exception the config helper actually raises on a missing key.
    logger.warning("no CLIFF connection")

NYT_THEME_LABELLER_URL = config.get('NYT_THEME_LABELLER_URL')

# Connect to the app's mongo DB
try:
    user_db = UserDatabase(config.get('MONGO_URL'))
    analytics_db = AnalyticsDatabase(config.get('MONGO_URL'))
    user_db.check_connection()
    logger.info("Connected to DB: {}".format(config.get('MONGO_URL')))
except Exception as err:
    # An unreachable DB is fatal: log the traceback and stop the process.
    logger.error("DB error: {0}".format(err))
    logger.exception(err)
    sys.exit()
Beispiel #5
0
# Step 2: takes untagged messages and splits them by sentence and then looks for location identifiers. put all statements w locations into new sheet.

from cliff.api import Cliff
import pandas as pd
import geoip2.database
import re

# GeoLite2 city database reader (never closed within this excerpt).
reader = geoip2.database.Reader("../GeoLite2-City_20210202/GeoLite2-City.mmdb")

# Client for a locally running CLIFF geoparser.
my_cliff = Cliff('http://localhost:8080')

file_name = "../processedData/messages.xlsx"  # path to file + file name
sheet = "Sheet1"  # sheet name or sheet number or list of sheet numbers and names

df = pd.read_excel(io=file_name, sheet_name=sheet)
excel_data = []    # rows accumulated for the output sheet
check_repeat = []  # sentence fragments already processed, to skip duplicates
for index, row in df.iterrows():
    # Split each message into sentence-ish fragments on '?', '.' and ':'.
    parsed_row = re.split('[?.:]', row['message'])
    for sentence in parsed_row:
        # Keep only short fragments: fewer than 4 words but more than 2 chars.
        if (len(sentence.split()) < 4 and len(sentence.strip()) > 2):
            if (sentence.strip() not in check_repeat):
                temp_data = {}
                check_repeat.append(sentence.strip())
                result = my_cliff.parse_text(sentence)
                # NOTE(review): the matching `except` for this `try` lies
                # beyond this excerpt.
                try:
                    # 'focus' holds the places CLIFF thinks the text is about.
                    targets = result['results']['places']['focus']
                    if targets != {}:
                        # message, author
                        temp_data['author'] = row['author']
                        temp_data['message'] = sentence.strip()
Beispiel #6
0
def extract_locaiton_info(text):
    """Print CLIFF's parse of *text* and a sample geonames lookup.

    NOTE(review): the "locaiton" typo in the name is kept because renaming
    would break existing callers.
    """
    client = Cliff(cliff_server_addr)
    parsed = client.parse_text(text)
    print(parsed)
    print(client.geonames_lookup(4943351))
Beispiel #7
0
def get_cliff_client():
    """Build and return a CLIFF API client bound to the configured CLIFF_URL."""
    client = Cliff(CLIFF_URL)
    return client
Beispiel #8
0
 def setUp(self):
     """Create the CLIFF client under test from the CLIFF_URL env var."""
     server_url = os.getenv("CLIFF_URL")
     self._url = server_url
     self._cliff = Cliff(server_url)
from cliff.api import Cliff
import json
from pprint import pprint

# Module-level CLIFF client shared by the helpers below (internal host).
my_cliff = Cliff("http://10.176.148.84:8080")


def extract_location(tweet):
    """Map each CLIFF 'focus' category in *tweet* to its list of place names.

    Returns a dict like {'countries': [...], 'states': [...], ...} built
    from the 'places'/'focus' section of the CLIFF parse response.
    """
    focus = my_cliff.parse_text(tweet)['results']['places']['focus']

    result = {}
    for category, places in focus.items():
        names = []
        for place in places:
            names.append(place['name'])
        result[category] = names

    return result


def readFile(fileName):
    """Load and return the JSON document stored at *fileName*.

    The original called ``f.close()`` after the ``with`` block, which had
    already closed the file — the redundant call is removed.
    """
    with open(fileName, 'r') as f:
        return json.load(f)


def writeFile(fileName, data):
    """Serialize *data* as JSON to *fileName*.

    The original called ``f.close()`` after the ``with`` block, which had
    already closed the file — the redundant call is removed.
    """
    with open(fileName, 'w') as f:
        json.dump(data, f)


if __name__ == '__main__':
Beispiel #10
0
MAX_CHARS = 250  # limit the amount of text users can send in

app = Flask(__name__)

# setup logging
logging.basicConfig(level=logging.WARN)
log = logging.getLogger(__file__)
log.info(
    "---------------------------------------------------------------------------"
)

app_config = config.get_default_config()

# set up the api client we will use
CLIFF_URL = app_config.get('CLIFF_URL')
cliff = Cliff(CLIFF_URL)
# Override the client's default versioned path with the server's actual one.
cliff.PARSE_TEXT_PATH = "/cliff/parse/text"  # instead of "/cliff-2.6.1/parse/text"


# render the homepage
@app.route("/")
def index():
    """Render the homepage template with the current app version."""
    homepage = render_template('home.html', version=VERSION)
    return homepage


# return json results from CLIFF
@app.route("/process", methods=['POST'])
def geoparse():
    # NOTE(review): the handler continues beyond this excerpt; the fields
    # below are read straight from the POST form — KeyError (HTTP 400 in
    # Flask) if any is missing.
    text = request.form['text']
    language = request.form['language']
    demonyms = request.form['demonyms'] == 'true'  # checkbox arrives as a string
Beispiel #11
0
logging.basicConfig()

# import required modules
from cliff.api import Cliff
import json
import numpy as np
import pandas as pd
# NOTE(review): pandas.io.json.json_normalize is deprecated in newer pandas
# (use pandas.json_normalize instead) — confirm the pinned pandas version.
from pandas.io.json import json_normalize

# read in abstracts
scraped_abstracts = pd.read_csv(
    'C:/Users/joeym/Documents/PhD/Aims/Aim 1 - collate pollinator knowledge/Outputs/scrape_abs/cleaned/for_geoparse/04_animal-species_abs_1-2-cleaned-for-geoparse.csv'
)

# assign the localhost address to my_cliff
my_cliff = Cliff('http://localhost:8999')

# result object to append to
result = []

# index for abstract object
abstract = scraped_abstracts['abstract']

# index for title object
EID = scraped_abstracts['EID']

# loop through abstracts
for i in range(0, len(abstract)):

    # NOTE(review): the body of this `try` continues beyond this excerpt.
    try:
Beispiel #12
0
# Configure file-based logging for the geocoding worker.
logging.basicConfig(
    filename=os.path.join(base_dir, 'worker.log'),
    level=logging.INFO,
    format='%(asctime)s | %(levelname)s | %(name)s | %(message)s')
logger = logging.getLogger(__name__)
logger.info(
    "------------------------------------------------------------------------")
logger.info("Starting up Geocoding Worker v{}".format(VERSION))

config = get_default_config()

# Message-broker connection string (presumably for the task queue — confirm).
BROKER_URL = config.get('BROKER_URL')
logger.info("BROKER_URL: {}".format(BROKER_URL))

# MediaCloud admin client.
MC_API_KEY = config.get('MC_API_KEY')
mc = mediacloud.api.AdminMediaCloud(MC_API_KEY)
# NOTE(review): this logs the raw API key to worker.log — consider redacting.
logger.info("MC_API_KEY: {}".format(MC_API_KEY))

# CLIFF geoparser client.
CLIFF_URL = config.get('CLIFF_URL')
cliff = Cliff(CLIFF_URL)
logger.info("CLIFF_URL: {}".format(CLIFF_URL))

# Sentry is optional: a missing SENTRY_DSN raises ConfigException and the
# worker simply runs without remote error reporting.
try:
    SENTRY_DSN = config.get('SENTRY_DSN')
    logger.info("SENTRY_DSN: {}".format(SENTRY_DSN))
    handler = SentryHandler(SENTRY_DSN)
    handler.setLevel(logging.WARN)
    setup_logging(handler)
except ConfigException:
    logger.info("No logging to sentry")