def main(events, file_details):
    """
    Geolocate each event by sending its source sentence to
    ``query_geotext`` (the GeoVista Center's GeoText API).

    For every event, the first entry of its ``'ids'`` list is split on
    ``'_'`` into a story ID and a sentence index. The story is fetched from
    the database, its ``'content'`` is segmented into sentences, and the
    indexed sentence is handed to ``query_geotext``.

    Parameters
    ----------

    events: Dictionary.
            Contains filtered events from the one-a-day filter. Keys are
            (DATE, SOURCE, TARGET, EVENT) tuples, values are lists of IDs,
            sources, and issues.

    file_details: Configuration object.
                  Its ``auth_db``, ``auth_user`` and ``auth_pass``
                  attributes are passed to ``utilities.make_conn``.

    Returns
    -------

    events: Dictionary.
            The same dictionary, updated in place. Events for which both a
            latitude and a longitude came back truthy gain a ``'geo'``
            value of the form (LON, LAT, NAME); other events are left
            untouched.
    """
    collection = utilities.make_conn(file_details.auth_db,
                                     file_details.auth_user,
                                     file_details.auth_pass)

    for details in events.values():
        # IDs look like "<story id>_<sentence index>".
        story_id, sent_index = details['ids'][0].split('_')

        story = collection.find_one({'_id': ObjectId(story_id)})
        sentences = utilities.sentence_segmenter(story['content'])
        sentence = sentences[int(sent_index)]

        lon, lat, name = query_geotext(sentence)
        if lat and lon:
            details['geo'] = (lon, lat, name)

    return events
def print_sentence(events, file_details, server_details, geo_details):
    """
    Attach the source sentence to each event instead of calling a geo
    service.

    For every event, the first entry of its ``'ids'`` list is split on
    ``'_'`` into a story ID and a sentence index. The story is fetched from
    the database, its ``'content'`` is segmented into sentences, and the
    indexed sentence is stored on the event.

    Parameters
    ----------

    events: Dictionary.
            Contains filtered events from the one-a-day filter. Keys are
            (DATE, SOURCE, TARGET, EVENT) tuples, values are lists of IDs,
            sources, and issues.

    file_details: Configuration object.
                  Its ``db_db``, ``db_collection``, ``auth_db``,
                  ``auth_user`` and ``auth_pass`` attributes are passed to
                  ``utilities.make_conn``.

    server_details: Unused by this function.

    geo_details: Unused by this function.

    Returns
    -------

    events: Dictionary.
            The same dictionary, updated in place. Every event gains a
            ``'geosentence'`` value holding the sentence text and a
            ``'geo'`` value that is a 5-tuple of ``"$TBD"`` placeholders
            standing in for lon, lat, placeName, stateName, countryCode.
    """
    placeholder = "$TBD"
    collection = utilities.make_conn(file_details.db_db,
                                     file_details.db_collection,
                                     file_details.auth_db,
                                     file_details.auth_user,
                                     file_details.auth_pass)

    for details in events.values():
        # IDs look like "<story id>_<sentence index>".
        story_id, sent_index = details['ids'][0].split('_')

        story = collection.find_one({'_id': ObjectId(story_id)})
        sentences = utilities.sentence_segmenter(story['content'])

        details['geosentence'] = sentences[int(sent_index)]
        details['geo'] = (placeholder,) * 5

    return events
def cliff(events, file_details, server_details, geo_details):
    """
    Pulls out a database ID and runs the ``query_cliff`` function to hit
    MIT's CLIFF/CLAVIN geolocation system running locally and find location
    information within the sentence. Note, this function calls back to the
    database where stories are stored.

    Parameters
    ----------

    events: Dictionary.
            Contains filtered events from the one-a-day filter. Keys are
            (DATE, SOURCE, TARGET, EVENT) tuples, values are lists of IDs,
            sources, and issues.

    file_details: Configuration object.
                  Its ``db_db``, ``db_collection``, ``auth_db``,
                  ``auth_user`` and ``auth_pass`` attributes are passed to
                  ``utilities.make_conn``.

    server_details: Unused by this function.

    geo_details: Configuration object.
                 Its ``cliff_host`` and ``cliff_port`` attributes are
                 passed to ``query_cliff``.

    Returns
    -------

    events: Dictionary.
            Same as in the parameter (updated in place) but, for events
            where ``query_cliff`` returned a truthy result, with the
            addition of a ``'geo'`` value that is a tuple of lon, lat,
            placeName, stateName, countryCode.
    """
    coll = utilities.make_conn(file_details.db_db,
                               file_details.db_collection,
                               file_details.auth_db,
                               file_details.auth_user,
                               file_details.auth_pass)

    for event in events:
        # IDs look like "<story id>_<sentence index>".
        event_id, sentence_id = events[event]['ids'][0].split('_')
        result = coll.find_one({'_id': ObjectId(event_id)})
        sents = utilities.sentence_segmenter(result['content'])
        query_text = sents[int(sentence_id)]

        geo_info = query_cliff(query_text, geo_details.cliff_host,
                               geo_details.cliff_port)
        if not geo_info:
            # Nothing usable came back; leave the event without 'geo'.
            continue

        try:
            if geo_info['countryCode'] != "":
                geo_info['countryCode'] = iso_convert(
                    geo_info['countryCode'])
        except Exception:
            # Best effort: keep the unconverted code rather than losing the
            # location. ``Exception`` (not a bare ``except``) so that
            # KeyboardInterrupt/SystemExit still propagate.
            logger.warning("""Error converting country codes.""")

        events[event]['geo'] = (geo_info['lon'], geo_info['lat'],
                                geo_info['placeName'],
                                geo_info['stateName'],
                                geo_info['countryCode'])
        # TODO: add in country and restype here

    return events
def cliff(events, file_details, server_details, geo_details):
    """
    Pulls out a database ID and runs the ``query_cliff`` function to hit
    MIT's CLIFF/CLAVIN geolocation system running locally and find location
    information within the sentence. Note, this function calls back to the
    database where stories are stored.

    Parameters
    ----------

    events: Dictionary.
            Contains filtered events from the one-a-day filter. Keys are
            (DATE, SOURCE, TARGET, EVENT) tuples, values are lists of IDs,
            sources, and issues.

    file_details: Configuration object.
                  Its ``db_db``, ``db_collection``, ``auth_db``,
                  ``auth_user`` and ``auth_pass`` attributes are passed to
                  ``utilities.make_conn``.

    server_details: Unused by this function.

    geo_details: Configuration object.
                 Its ``cliff_host`` and ``cliff_port`` attributes are
                 passed to ``query_cliff``.

    Returns
    -------

    events: Dictionary.
            Same as in the parameter (updated in place) but, for events
            where ``query_cliff`` returned a truthy result, with the
            addition of a ``'geo'`` value that is a tuple of lon, lat,
            placeName, stateName, countryCode.
    """
    coll = utilities.make_conn(file_details.db_db,
                               file_details.db_collection,
                               file_details.auth_db,
                               file_details.auth_user,
                               file_details.auth_pass)

    for event in events:
        # IDs look like "<story id>_<sentence index>".
        event_id, sentence_id = events[event]['ids'][0].split('_')
        result = coll.find_one({'_id': ObjectId(event_id)})
        sents = utilities.sentence_segmenter(result['content'])
        query_text = sents[int(sentence_id)]

        geo_info = query_cliff(query_text, geo_details.cliff_host,
                               geo_details.cliff_port)
        if not geo_info:
            # Nothing usable came back; leave the event without 'geo'.
            continue

        try:
            if geo_info['countryCode'] != "":
                geo_info['countryCode'] = iso_convert(
                    geo_info['countryCode'])
        except Exception:
            # Best effort: keep the unconverted code rather than losing the
            # location. ``Exception`` (not a bare ``except``) so that
            # KeyboardInterrupt/SystemExit still propagate.
            logger.warning("""Error converting country codes.""")

        events[event]['geo'] = (geo_info['lon'], geo_info['lat'],
                                geo_info['placeName'],
                                geo_info['stateName'],
                                geo_info['countryCode'])
        # TODO: add in country and restype here

    return events
def mordecai(events, file_details, server_details, geo_details):
    """
    Geolocate each event by sending its source sentence to the Mordecai
    geolocation system via ``query_mordecai``.

    For every event, the first entry of its ``'ids'`` list is split on
    ``'_'`` into a story ID and a sentence index. The story is fetched from
    the database, its ``'content'`` is segmented into sentences, and the
    indexed sentence is handed to ``query_mordecai``.

    Parameters
    ----------

    events: Dictionary.
            Contains filtered events from the one-a-day filter. Keys are
            (DATE, SOURCE, TARGET, EVENT) tuples, values are lists of IDs,
            sources, and issues.

    file_details: Configuration object.
                  Its ``db_db``, ``db_collection``, ``auth_db``,
                  ``auth_user`` and ``auth_pass`` attributes are passed to
                  ``utilities.make_conn``.

    server_details: Unused by this function.

    geo_details: Configuration object.
                 Its ``mordecai_host`` and ``mordecai_port`` attributes are
                 passed to ``query_mordecai``.

    Returns
    -------

    events: Dictionary.
            The same dictionary, updated in place. Every event gains a
            ``'geo'`` value that is a tuple of lon, lat, placeName,
            stateName, countryCode. stateName is always ``"NA"``, and all
            five fields are ``"NA"`` when the first returned location
            could not be read.
    """
    collection = utilities.make_conn(file_details.db_db,
                                     file_details.db_collection,
                                     file_details.auth_db,
                                     file_details.auth_user,
                                     file_details.auth_pass)

    for details in events.values():
        # IDs look like "<story id>_<sentence index>".
        story_id, sent_index = details['ids'][0].split('_')

        story = collection.find_one({'_id': ObjectId(story_id)})
        sentences = utilities.sentence_segmenter(story['content'])
        sentence = sentences[int(sent_index)]

        locations = query_mordecai(sentence, geo_details.mordecai_host,
                                   geo_details.mordecai_port)
        try:
            # Temporary hack: only the first location is used.
            first = locations[0]
            # "NA" stands in for ADM1, which mord doesn't return.
            # See issue #2.
            details['geo'] = (first['lon'], first['lat'],
                              first['placename'], "NA",
                              first['countrycode'])
        except Exception:
            # Deliberate best effort: any failure reading the result
            # yields an all-"NA" record instead of aborting the run.
            details['geo'] = ("NA", "NA", "NA", "NA", "NA")

    return events
def main(current_date, file_details, write_file=False, file_stem=None):
    """
    Function to create a connection to a MongoDB instance, query for a given
    day's results, optionally write the results to a file, and return the
    results.

    Parameters
    ----------

    current_date: datetime object.
                    Date for which records are pulled. Normally this is
                    $date_running - 1. For example, if the script is running
                    on the 25th, the current_date will be the 24th.

    file_details: Named tuple.
                    Tuple containing config information. The ``auth_db``,
                    ``auth_user`` and ``auth_pass`` fields are passed to
                    ``utilities.make_conn``.

    write_file: Boolean.
                Option indicating whether to write the results from the web
                scraper to an intermediate file. Forwarded to ``query_all``.
                Defaults to false.

    file_stem: String. Optional.
                Optional string defining the file stem for the intermediate
                file for the scraper results.

    Returns
    -------

    posts: Dictionary.
            Dictionary of results from the MongoDB query.

    filename: String.
                If ``query_all`` returned text and `file_stem` is given,
                contains the filename to which the scraper results are
                written. Otherwise is an empty string.
    """
    sources = _get_sources('source_keys.txt')
    conn = utilities.make_conn(file_details.auth_db, file_details.auth_user,
                               file_details.auth_pass)

    # Bounds handed to query_all: midnight of the day before current_date
    # and midnight of the day after it.
    day_start = datetime.datetime(current_date.year, current_date.month,
                                  current_date.day)
    one_day = datetime.timedelta(days=1)
    greater_than = day_start - one_day
    less_than = day_start + one_day

    results, text = query_all(conn, less_than, greater_than, sources,
                              write_file=write_file)

    filename = ''
    if text:
        # Decode only raw bytes; text that is already decoded (a Python 3
        # ``str``) has no ``.decode`` and previously raised AttributeError.
        if isinstance(text, bytes):
            text = text.decode('utf-8')

        if file_stem:
            filename = '{}{:02d}{:02d}{:02d}.txt'.format(file_stem,
                                                         current_date.year,
                                                         current_date.month,
                                                         current_date.day)
            with codecs.open(filename, 'w', encoding='utf-8') as f:
                f.write(text)
        else:
            print('Need filestem to write results to file.')

    return results, filename