Example #1
def main(events, file_details):
    """
    Pulls out a database ID and runs the ``query_geotext`` function to hit the
    GeoVista Center's GeoText API and find location information within the
    sentence.

    Parameters
    ----------

    events: Dictionary.
            Contains filtered events from the one-a-day filter. Keys are
            (DATE, SOURCE, TARGET, EVENT) tuples, values are lists of
            IDs, sources, and issues.

    Returns
    -------

    events: Dictionary.
            Same as in the parameter but with the addition of a value that is
            a tuple of the form (LON, LAT, NAME).
    """
    coll = utilities.make_conn(file_details.auth_db, file_details.auth_user,
                               file_details.auth_pass)

    for event in events:
        event_id, sentence_id = events[event]['ids'][0].split('_')
        result = coll.find_one({'_id': ObjectId(event_id.split('_')[0])})
        sents = utilities.sentence_segmenter(result['content'])

        query_text = sents[int(sentence_id)]
        lon, lat, name = query_geotext(query_text)
        if lat and lon:
            events[event]['geo'] = (lon, lat, name)

    return events
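A minimal call sketch follows. It assumes the surrounding module (``utilities``, ``ObjectId``, ``query_geotext``) is importable and a MongoDB instance is reachable; the ``FileDetails`` tuple, the credentials, and the sample key and ID below are hypothetical and only illustrate the shapes the function expects.

from collections import namedtuple

# Hypothetical config tuple; the field names mirror the attributes accessed above.
FileDetails = namedtuple('FileDetails', ['auth_db', 'auth_user', 'auth_pass'])
file_details = FileDetails(auth_db='phoenix', auth_user='user', auth_pass='pass')

# Keys are (DATE, SOURCE, TARGET, EVENT) tuples; each 'ids' entry is a
# "<mongo_object_id>_<sentence_index>" string, which is what split('_') expects.
events = {
    ('20150624', 'USA', 'RUS', '042'): {
        'ids': ['55899cd9c951851a3eba8e32_2'],
        'sources': ['example_source'],
        'issues': [],
    },
}

events = main(events, file_details)
# On success each value gains a 'geo' entry of the form (LON, LAT, NAME).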
Example #2
def print_sentence(events, file_details, server_details, geo_details):
    """
    Returns the sentence instead of calling a geo service.
    Parameters
    ----------
    events: Dictionary.
            Contains filtered events from the one-a-day filter. Keys are
            (DATE, SOURCE, TARGET, EVENT) tuples, values are lists of
            IDs, sources, and issues.
    Returns
    -------
    events: Dictionary.
            Same as in the parameter but with the addition of the source
            sentence under the ``geosentence`` key and a placeholder ``geo`` tuple.
    """
    coll = utilities.make_conn(file_details.db_db, file_details.db_collection,
                               file_details.auth_db, file_details.auth_user,
                               file_details.auth_pass)

    for event in events:
        event_id, sentence_id = events[event]['ids'][0].split('_')
        # print(event_id)
        result = coll.find_one({'_id': ObjectId(event_id.split('_')[0])})
        sents = utilities.sentence_segmenter(result['content'])

        events[event]['geosentence'] = sents[int(sentence_id)]
        events[event]['geo'] = ("$TBD", "$TBD", "$TBD", "$TBD", "$TBD")

    return events
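A short sketch of how the returned dictionary might be inspected; it relies only on the two keys set above, and the sample key in the comment is invented.

# Assuming `events` came back from print_sentence above, each value now carries
# the raw sentence plus a placeholder geolocation tuple.
for key, value in events.items():
    print(key)                    # e.g. ('20150624', 'USA', 'RUS', '042')
    print(value['geosentence'])   # sentence text pulled from the stored story
    print(value['geo'])           # ("$TBD", "$TBD", "$TBD", "$TBD", "$TBD")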
Example #3
def cliff(events, file_details, server_details, geo_details):
    """
    Pulls out a database ID and runs the ``query_cliff`` function to hit MIT's
    CLIFF/CLAVIN geolocation system running locally and find location
    information within the sentence. Note that this function calls back to the
    database where the stories are stored.
    Parameters
    ----------
    events: Dictionary.
            Contains filtered events from the one-a-day filter. Keys are
            (DATE, SOURCE, TARGET, EVENT) tuples, values are lists of
            IDs, sources, and issues.
    Returns
    -------
    events: Dictionary.
            Same as in the parameter but with the addition of a value that is
            a tuple of (lon, lat, placeName, stateName, countryCode).
    """
    coll = utilities.make_conn(file_details.db_db, file_details.db_collection,
                               file_details.auth_db, file_details.auth_user,
                               file_details.auth_pass)

    for event in events:
        event_id, sentence_id = events[event]['ids'][0].split('_')
        # print(event_id)
        result = coll.find_one({'_id': ObjectId(event_id.split('_')[0])})
        sents = utilities.sentence_segmenter(result['content'])

        query_text = sents[int(sentence_id)]
        geo_info = query_cliff(query_text, geo_details.cliff_host,
                               geo_details.cliff_port)
        if geo_info:
            try:
                if geo_info['countryCode'] != "":
                    geo_info['countryCode'] = iso_convert(
                        geo_info['countryCode'])
            except Exception:
                logger.warning("Error converting country codes.")
            events[event]['geo'] = (geo_info['lon'], geo_info['lat'],
                                    geo_info['placeName'],
                                    geo_info['stateName'],
                                    geo_info['countryCode'])
            # Add in country and restype here
    return events
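The tuple assembly above depends only on five keys of the ``query_cliff`` response. A standalone sketch with a hand-written response dict (the coordinates and names are invented, and the ``iso_convert`` step is skipped) shows the same transformation without a running CLIFF/CLAVIN server.

# Hypothetical CLIFF-style response; the key names mirror those used above,
# the values are made up for illustration.
geo_info = {'lon': 30.52, 'lat': 50.45, 'placeName': 'Kyiv',
            'stateName': 'Kyiv City', 'countryCode': 'UKR'}

if geo_info:
    geo = (geo_info['lon'], geo_info['lat'], geo_info['placeName'],
           geo_info['stateName'], geo_info['countryCode'])
# geo == (30.52, 50.45, 'Kyiv', 'Kyiv City', 'UKR')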
Example #4
def cliff(events, file_details, server_details, geo_details):
    """
    Pulls out a database ID and runs the ``query_cliff`` function to hit MIT's
    CLIFF/CLAVIN geolocation system running locally and find location
    information within the sentence. Note that this function calls back to the
    database where the stories are stored.
    Parameters
    ----------
    events: Dictionary.
            Contains filtered events from the one-a-day filter. Keys are
            (DATE, SOURCE, TARGET, EVENT) tuples, values are lists of
            IDs, sources, and issues.
    Returns
    -------
    events: Dictionary.
            Same as in the parameter but with the addition of a value that is
            a tuple of (lon, lat, placeName, stateName, countryCode).
    """
    coll = utilities.make_conn(file_details.db_db, file_details.db_collection,
                               file_details.auth_db, file_details.auth_user,
                               file_details.auth_pass)

    for event in events:
        event_id, sentence_id = events[event]['ids'][0].split('_')
        # print(event_id)
        result = coll.find_one({'_id': ObjectId(event_id.split('_')[0])})
        sents = utilities.sentence_segmenter(result['content'])

        query_text = sents[int(sentence_id)]
        geo_info = query_cliff(query_text, geo_details.cliff_host,
                               geo_details.cliff_port)
        if geo_info:
            try:
                if geo_info['countryCode'] != "":
                    geo_info['countryCode'] = iso_convert(geo_info['countryCode'])
            except Exception:
                logger.warning("Error converting country codes.")
            events[event]['geo'] = (geo_info['lon'], geo_info['lat'],
                                    geo_info['placeName'],
                                    geo_info['stateName'],
                                    geo_info['countryCode'])
            # Add in country and restype here
    return events
Example #5
def mordecai(events, file_details, server_details, geo_details):
    """
    Pulls out a database ID and queries the Mordecai geolocation system
    running locally and finds location information within the sentence.
    Parameters
    ----------
    events: Dictionary.
            Contains filtered events from the one-a-day filter. Keys are
            (DATE, SOURCE, TARGET, EVENT) tuples, values are lists of
            IDs, sources, and issues.
    Returns
    -------
    events: Dictionary.
            Same as in the parameter but with the addition of a value that is
            a tuple of (lon, lat, placeName, stateName, countryCode).
    """
    coll = utilities.make_conn(file_details.db_db, file_details.db_collection,
                               file_details.auth_db, file_details.auth_user,
                               file_details.auth_pass)

    for event in events:
        event_id, sentence_id = events[event]['ids'][0].split('_')
        # print(event_id)
        result = coll.find_one({'_id': ObjectId(event_id.split('_')[0])})
        sents = utilities.sentence_segmenter(result['content'])

        query_text = sents[int(sentence_id)]
        geo_info = query_mordecai(query_text, geo_details.mordecai_host,
                                  geo_details.mordecai_port)
        try:
            # temporary hack: take the first location:
            geo_info = geo_info[0]
            # NA is for ADM1, which mord doesn't return. See issue #2
            events[event]['geo'] = (geo_info['lon'], geo_info['lat'],
                                    geo_info['placename'], "NA",
                                    geo_info['countrycode'])
        except Exception:
            events[event]['geo'] = ("NA", "NA", "NA", "NA", "NA")

    return events
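``query_mordecai`` is expected to return a list of candidate locations, and the loop above keeps only the first one. A standalone sketch with an invented single-candidate list mirrors that fallback logic without a running Mordecai instance.

# Hypothetical Mordecai-style response: a list of location dicts with the
# lower-case keys used above; the values are made up for illustration.
geo_info = [{'lon': 36.82, 'lat': -1.29, 'placename': 'Nairobi',
             'countrycode': 'KEN'}]

try:
    top = geo_info[0]  # temporary hack from the code above: take the first hit
    geo = (top['lon'], top['lat'], top['placename'], "NA", top['countrycode'])
except Exception:
    geo = ("NA", "NA", "NA", "NA", "NA")
# geo == (36.82, -1.29, 'Nairobi', 'NA', 'KEN')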
Example #6
def mordecai(events, file_details, server_details, geo_details):
    """
    Pulls out a database ID and queries the Mordecai geolocation system
    running locally and finds location information within the sentence.
    Parameters
    ----------
    events: Dictionary.
            Contains filtered events from the one-a-day filter. Keys are
            (DATE, SOURCE, TARGET, EVENT) tuples, values are lists of
            IDs, sources, and issues.
    Returns
    -------
    events: Dictionary.
            Same as in the parameter but with the addition of a value that is
            a tuple of (lon, lat, placeName, stateName, countryCode).
    """
    coll = utilities.make_conn(file_details.db_db, file_details.db_collection,
                               file_details.auth_db, file_details.auth_user,
                               file_details.auth_pass)

    for event in events:
        event_id, sentence_id = events[event]['ids'][0].split('_')
        # print(event_id)
        result = coll.find_one({'_id': ObjectId(event_id.split('_')[0])})
        sents = utilities.sentence_segmenter(result['content'])

        query_text = sents[int(sentence_id)]
        geo_info = query_mordecai(query_text, geo_details.mordecai_host,
                                  geo_details.mordecai_port)
        try:
            # temporary hack: take the first location:
            geo_info = geo_info[0]
            # NA is for ADM1, which mord doesn't return. See issue #2
            events[event]['geo'] = (geo_info['lon'], geo_info['lat'],
                                    geo_info['placename'], "NA",
                                    geo_info['countrycode'])
        except Exception:
            events[event]['geo'] = ("NA", "NA", "NA", "NA", "NA")

    return events
Example #7
def main(current_date, file_details, write_file=False, file_stem=None):
    """
    Function to create a connection to a MongoDB instance, query for a given
    day's results, optionally write the results to a file, and return the
    results.

    Parameters
    ----------

    current_date: datetime object.
                    Date for which records are pulled. Normally this is
                    $date_running - 1. For example, if the script is running on
                    the 25th, the current_date will be the 24th.

    file_details: Named tuple.
                    Tuple containing config information.

    write_file: Boolean.
                Option indicating whether to write the results from the web
                scraper to an intermediate file. Defaults to False.

    file_stem: String. Optional.
                Optional string defining the file stem for the intermediate
                file for the scraper results.

    Returns
    -------

    results: Dictionary.
            Dictionary of results from the MongoDB query.

    filename: String.
                If `write_file` is True, contains the filename to which the
                scraper results are written. Otherwise it is an empty string.

    """
    sources = _get_sources('source_keys.txt')
    conn = utilities.make_conn(file_details.auth_db, file_details.auth_user,
                               file_details.auth_pass)

    less_than = datetime.datetime(current_date.year, current_date.month,
                                  current_date.day)
    greater_than = less_than - datetime.timedelta(days=1)
    less_than = less_than + datetime.timedelta(days=1)

    results, text = query_all(conn, less_than, greater_than, sources,
                              write_file=write_file)

    filename = ''
    if text:
        text = text.decode('utf-8')

        if file_stem:
            filename = '{}{:02d}{:02d}{:02d}.txt'.format(file_stem,
                                                         current_date.year,
                                                         current_date.month,
                                                         current_date.day)
            with codecs.open(filename, 'w', encoding='utf-8') as f:
                f.write(text)
        else:
            print('Need filestem to write results to file.')

    return results, filename
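The query window above is built purely with the standard library; a self-contained sketch (the example date is arbitrary) shows the bounds that end up being passed to ``query_all``.

import datetime

current_date = datetime.datetime(2015, 6, 24)  # arbitrary example date

less_than = datetime.datetime(current_date.year, current_date.month,
                              current_date.day)
greater_than = less_than - datetime.timedelta(days=1)  # 2015-06-23 00:00
less_than = less_than + datetime.timedelta(days=1)     # 2015-06-25 00:00
# Stories timestamped between those two midnights fall inside the query window.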