import logging

import utilities  # project-local helper module (assumed importable as-is)
from bson import ObjectId
from elasticsearch_dsl import Search

# ``query_geotext``, ``query_cliff``, ``query_mordecai``, ``iso_convert``, and
# ``_get_story`` are assumed to be defined in, or imported by, the surrounding
# package; their definitions are not part of this excerpt.

# Module-level logger; the name matches the one used in query_all() below.
logger = logging.getLogger('pipeline_log')


def main(events, file_details):
    """
    Pulls out a database ID and runs the ``query_geotext`` function to hit the
    GeoVista Center's GeoText API and find location information within the
    sentence.

    Parameters
    ----------

    events: Dictionary.
            Contains filtered events from the one-a-day filter. Keys are
            (DATE, SOURCE, TARGET, EVENT) tuples, values are lists of
            IDs, sources, and issues.

    Returns
    -------

    events: Dictionary.
            Same as in the parameter but with the addition of a value that is
            a tuple of the form (LON, LAT, NAME).
    """
    coll = utilities.make_conn(file_details.auth_db, file_details.auth_user,
                               file_details.auth_pass)

    for event in events:
        # IDs are formatted as '<mongo_id>_<sentence_number>'.
        event_id, sentence_id = events[event]['ids'][0].split('_')
        result = coll.find_one({'_id': ObjectId(event_id)})
        sents = utilities.sentence_segmenter(result['content'])

        query_text = sents[int(sentence_id)]
        lon, lat, name = query_geotext(query_text)
        if lat and lon:
            events[event]['geo'] = (lon, lat, name)

    return events
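

# A minimal usage sketch, hedged: the event key, ObjectId hex string, and
# sentence index below are illustrative only, and `file_details` is assumed
# to be the pipeline's parsed config object.
#
#   events = {('20150822', 'USA', 'SYR', '190'):
#             {'ids': ['55d8b2f3e4b0c3a9d4e1f2a3_2']}}
#   events = main(events, file_details)
#   events[('20150822', 'USA', 'SYR', '190')].get('geo')
#   # -> (lon, lat, name) if GeoText found a location in the sentence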


def print_sentence(events, file_details, server_details, geo_details):
    """
    Stores the source sentence for each event instead of calling a
    geolocation service.

    Parameters
    ----------

    events: Dictionary.
            Contains filtered events from the one-a-day filter. Keys are
            (DATE, SOURCE, TARGET, EVENT) tuples, values are lists of
            IDs, sources, and issues.

    Returns
    -------

    events: Dictionary.
            Same as in the parameter but with the addition of a 'geosentence'
            value holding the source sentence, plus a placeholder 'geo' tuple
            standing in for lon, lat, placeName, stateName, and countryCode.
    """
    coll = utilities.make_conn(file_details.db_db, file_details.db_collection,
                               file_details.auth_db, file_details.auth_user,
                               file_details.auth_pass)

    for event in events:
        event_id, sentence_id = events[event]['ids'][0].split('_')
        result = coll.find_one({'_id': ObjectId(event_id)})
        sents = utilities.sentence_segmenter(result['content'])

        events[event]['geosentence'] = sents[int(sentence_id)]
        events[event]['geo'] = ("$TBD", "$TBD", "$TBD", "$TBD", "$TBD")

    return events


def cliff(events, file_details, server_details, geo_details):
    """
    Pulls out a database ID and runs the ``query_cliff`` function to hit MIT's
    CLIFF/CLAVIN geolocation system running locally and find location
    information within the sentence. Note that this function calls back to
    the database where stories are stored.

    Parameters
    ----------

    events: Dictionary.
            Contains filtered events from the one-a-day filter. Keys are
            (DATE, SOURCE, TARGET, EVENT) tuples, values are lists of
            IDs, sources, and issues.

    Returns
    -------

    events: Dictionary.
            Same as in the parameter but with the addition of a value that is
            a tuple of lon, lat, placeName, stateName, countryCode.
    """
    coll = utilities.make_conn(file_details.db_db, file_details.db_collection,
                               file_details.auth_db, file_details.auth_user,
                               file_details.auth_pass)

    for event in events:
        event_id, sentence_id = events[event]['ids'][0].split('_')
        result = coll.find_one({'_id': ObjectId(event_id)})
        sents = utilities.sentence_segmenter(result['content'])

        query_text = sents[int(sentence_id)]
        geo_info = query_cliff(query_text, geo_details.cliff_host,
                               geo_details.cliff_port)
        if geo_info:
            try:
                if geo_info['countryCode'] != "":
                    geo_info['countryCode'] = iso_convert(
                        geo_info['countryCode'])
            except Exception:
                logger.warning('Error converting country codes.')
            events[event]['geo'] = (geo_info['lon'], geo_info['lat'],
                                    geo_info['placeName'],
                                    geo_info['stateName'],
                                    geo_info['countryCode'])
            # TODO: Add in country and restype here.

    return events
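

# For reference, a hedged sketch of the flat dict ``query_cliff`` is assumed
# to return (keys taken from the access pattern above, values illustrative):
#
#   geo_info = {'lon': 36.29, 'lat': 33.51, 'placeName': 'Damascus',
#               'stateName': 'Dimashq', 'countryCode': 'SY'}
#
# cliff() converts the country code via iso_convert() and repacks the rest
# into the five-element 'geo' tuple.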


def mordecai(events, file_details, server_details, geo_details):
    """
    Pulls out a database ID and queries the Mordecai geolocation system
    running locally to find location information within the sentence.

    Parameters
    ----------

    events: Dictionary.
            Contains filtered events from the one-a-day filter. Keys are
            (DATE, SOURCE, TARGET, EVENT) tuples, values are lists of
            IDs, sources, and issues.

    Returns
    -------

    events: Dictionary.
            Same as in the parameter but with the addition of a value that is
            a tuple of lon, lat, placeName, stateName, countryCode.
    """
    coll = utilities.make_conn(file_details.db_db, file_details.db_collection,
                               file_details.auth_db, file_details.auth_user,
                               file_details.auth_pass)

    for event in events:
        event_id, sentence_id = events[event]['ids'][0].split('_')
        result = coll.find_one({'_id': ObjectId(event_id)})
        sents = utilities.sentence_segmenter(result['content'])

        query_text = sents[int(sentence_id)]
        geo_info = query_mordecai(query_text, geo_details.mordecai_host,
                                  geo_details.mordecai_port)
        try:
            # Temporary hack: take the first location only.
            geo_info = geo_info[0]
            # "NA" stands in for ADM1, which Mordecai doesn't return.
            # See issue #2.
            events[event]['geo'] = (geo_info['lon'], geo_info['lat'],
                                    geo_info['placename'], "NA",
                                    geo_info['countrycode'])
        except Exception:
            events[event]['geo'] = ("NA", "NA", "NA", "NA", "NA")

    return events
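

# Both cliff() and mordecai() read their service endpoints off
# ``geo_details``. A config object along these lines is assumed (attribute
# names from the code above; hosts and ports illustrative):
#
#   geo_details.cliff_host = 'http://localhost'
#   geo_details.cliff_port = '8080'
#   geo_details.mordecai_host = 'http://localhost'
#   geo_details.mordecai_port = '5000'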


def format_content(raw_content):
    """
    Function to process a given news story for further formatting. Calls a
    function that extracts the story text minus the date and source line.
    Also splits the sentences using the ``sentence_segmenter()`` function.

    Parameters
    ----------

    raw_content: String.
                 Content of a news story as pulled from the web scraping
                 database.

    Returns
    -------

    sent_list: List.
               List of sentences.
    """
    content = _get_story(raw_content)
    split = utilities.sentence_segmenter(content)

    return split


def query_all(collection, lt_date, gt_date, sources, write_file=False):
    """
    Function to query the MongoDB instance and obtain results for the desired
    date range. The query constructed is: gt_date < date_added <= lt_date.

    Parameters
    ----------

    collection: pymongo.collection.Collection.
                Collection within MongoDB that holds the scraped news stories.

    lt_date: Datetime object.
             Date for which results should be older than. For example, if the
             date running is the 25th, and the desired date is the 24th, then
             the `lt_date` is the 25th.

    gt_date: Datetime object.
             Date for which results should be more recent than. For example,
             if the date running is the 25th, and the desired date is the
             24th, then the `gt_date` is the 23rd.

    sources: List.
             Sources to pull from the MongoDB instance.

    write_file: Boolean.
                Option indicating whether to write the results from the web
                scraper to an intermediate file. Defaults to False.

    Returns
    -------

    posts: List.
           List of dictionaries of results from the MongoDB query.

    final_out: String.
               If `write_file` is True, this contains a string representation
               of the query results. Otherwise, contains an empty string.
    """
    logger = logging.getLogger('pipeline_log')

    final_out = ''
    if write_file:
        output = []
        posts = collection.find({"$and": [{"date_added": {"$lt": lt_date}},
                                          {"date_added": {"$gt": gt_date}},
                                          {"source": {"$in": sources}}]})
        for num, post in enumerate(posts):
            try:
                content = post['content'].encode('utf-8')
                if post['source'] == 'aljazeera':
                    # Strip Al Jazeera's browser-upgrade boilerplate.
                    content = content.replace("""Caution iconAttention The browser or device you are using is out of date. It has known security flaws and a limited feature set. You will not see all the features of some websites. Please update your browser.""", '')
                header = ' '.join(utilities.sentence_segmenter(content)[:4])
                string = '{}\t{}\t{}\n{}\n'.format(num, post['date'],
                                                   post['url'], header)
                output.append(string)
            except Exception as e:
                print('Error on entry {}: {}.'.format(num, e))
        final_out = '\n'.join(output)

    # Note that this final query uses $lte on the upper bound, while the
    # preview query above uses $lt.
    posts = collection.find({"$and": [{"date_added": {"$lte": lt_date}},
                                      {"date_added": {"$gt": gt_date}},
                                      {"source": {"$in": sources}}]})

    print('Total number of stories: {}'.format(posts.count()))
    logger.info('Total number of stories: {}'.format(posts.count()))
    posts = list(posts)

    return posts, final_out
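

# A hedged example of calling the MongoDB variant for a one-day window; the
# client setup, database/collection names, and source list are illustrative:
#
#   from datetime import datetime
#   from pymongo import MongoClient
#
#   coll = MongoClient()['event_scrape']['stories']
#   posts, final_out = query_all(coll,
#                                lt_date=datetime(2015, 8, 25),
#                                gt_date=datetime(2015, 8, 23),
#                                sources=['bbc', 'aljazeera'],
#                                write_file=True)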


def query_all(collection, lt_date, gt_date, sources, elasticsearch, index,
              write_file=False):
    """
    Function to query the MongoDB instance, or optionally an Elasticsearch
    index, and obtain results for the desired date range. The query
    constructed is: gt_date < date_added <= lt_date.

    Parameters
    ----------

    collection: pymongo.collection.Collection.
                Collection within MongoDB that holds the scraped news
                stories. When `elasticsearch` is True, this is instead an
                Elasticsearch client.

    lt_date: Datetime object.
             Date for which results should be older than. For example, if the
             date running is the 25th, and the desired date is the 24th, then
             the `lt_date` is the 25th.

    gt_date: Datetime object.
             Date for which results should be more recent than. For example,
             if the date running is the 25th, and the desired date is the
             24th, then the `gt_date` is the 23rd.

    sources: List.
             Sources to pull from the MongoDB instance. Ignored when
             `elasticsearch` is True.

    elasticsearch: Boolean.
                   Whether to query Elasticsearch instead of MongoDB.

    index: String.
           Name of the Elasticsearch index to query when `elasticsearch` is
           True.

    write_file: Boolean.
                Option indicating whether to write the results from the web
                scraper to an intermediate file. Defaults to False.

    Returns
    -------

    posts: List.
           List of dictionaries of results from the MongoDB query, or the
           hits from the Elasticsearch query.

    final_out: String.
               If `write_file` is True, this contains a string representation
               of the query results. Otherwise, contains an empty string.
    """
    logger = logging.getLogger('pipeline_log')

    final_out = ''
    if not elasticsearch:
        # Using elasticsearch and writing the file are incompatible at this
        # time.
        if write_file:
            output = []
            posts = collection.find({"$and": [{"date_added": {"$lt": lt_date}},
                                              {"date_added": {"$gt": gt_date}},
                                              {"source": {"$in": sources}}]})
            for num, post in enumerate(posts):
                try:
                    content = post['content'].encode('utf-8')
                    if post['source'] == 'aljazeera':
                        # Strip Al Jazeera's browser-upgrade boilerplate.
                        content = content.replace("""Caution iconAttention The browser or device you are using is out of date. It has known security flaws and a limited feature set. You will not see all the features of some websites. Please update your browser.""", '')
                    header = ' '.join(utilities.sentence_segmenter(content)[:4])
                    string = '{}\t{}\t{}\n{}\n'.format(num, post['date'],
                                                       post['url'], header)
                    output.append(string)
                except Exception as e:
                    print('Error on entry {}: {}.'.format(num, e))
            final_out = '\n'.join(output)

        posts = collection.find({"$and": [{"date_added": {"$lte": lt_date}},
                                          {"date_added": {"$gt": gt_date}},
                                          {"source": {"$in": sources}}]})
    else:
        # Do a date range query, keeping only documents where stanford = 1.
        # It is questionable whether the stanford=1 check should live here;
        # it is an error to run the phoenix_pipeline on data that has not
        # already been run through the Stanford pipeline.
        lt_time = lt_date.strftime('%Y-%m-%dT%X.%f%z')
        gt_time = gt_date.strftime('%Y-%m-%dT%X.%f%z')
        s = Search(using=collection, index=index, doc_type="news") \
            .filter("range", published_date={"lt": lt_time, "gt": gt_time}) \
            .filter("term", stanford=1)
        posts = s.execute()

    if elasticsearch:
        print('Total number of stories: {}'.format(posts.hits.total))
        logger.info('Total number of stories: {}'.format(posts.hits.total))
    else:
        print('Total number of stories: {}'.format(posts.count()))
        logger.info('Total number of stories: {}'.format(posts.count()))
    posts = posts.hits if elasticsearch else list(posts)

    return posts, final_out
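

# The Elasticsearch-aware variant above expects ``collection`` to be an
# Elasticsearch client rather than a pymongo collection. A hedged sketch
# (host and index name illustrative):
#
#   from elasticsearch import Elasticsearch
#
#   es = Elasticsearch(['localhost:9200'])
#   posts, _ = query_all(es, lt_date, gt_date, sources=[],
#                        elasticsearch=True, index='news')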