def main(events, file_details):
    """
    Pulls out a database ID and runs the ``query_geotext`` function to hit the
    GeoVista Center's GeoText API and find location information within the
    sentence.

    Parameters
    ----------

    events: Dictionary.
            Contains filtered events from the one-a-day filter. Keys are
            (DATE, SOURCE, TARGET, EVENT) tuples, values are dicts holding
            lists of IDs, sources, and issues.

    file_details: Config object.
            Carries the database authentication details used here:
            ``auth_db``, ``auth_user``, and ``auth_pass``.

    Returns
    -------

    events: Dictionary.
            Same as the parameter, but with the addition of a ``geo`` value
            that is a tuple of the form (LON, LAT, NAME).
    """
    coll = utilities.make_conn(file_details.auth_db, file_details.auth_user,
                               file_details.auth_pass)

    for event in events:
        event_id, sentence_id = events[event]['ids'][0].split('_')
        result = coll.find_one({'_id': ObjectId(event_id.split('_')[0])})
        sents = utilities.sentence_segmenter(result['content'])

        query_text = sents[int(sentence_id)]
        lon, lat, name = query_geotext(query_text)
        if lat and lon:
            events[event]['geo'] = (lon, lat, name)

    return events
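For orientation, a minimal usage sketch follows. The ``events`` dict is shaped as the docstring describes (each value holds an ``'ids'`` list of ``'<mongo_object_id>_<sentence_index>'`` strings); the ``file_details`` stub and all values are hypothetical, and it assumes the module-level ``utilities``, ``ObjectId``, and ``query_geotext`` imports that the function relies on.

from collections import namedtuple

# Hypothetical config stub; only the attributes main() actually reads.
FileDetails = namedtuple('FileDetails', ['auth_db', 'auth_user', 'auth_pass'])
file_details = FileDetails(auth_db='phoenix', auth_user='user', auth_pass='pass')

# Keys are (DATE, SOURCE, TARGET, EVENT) tuples; each 'ids' entry is
# '<mongo_object_id>_<sentence_index>' (values here are made up).
events = {('20150101', 'USA', 'RUS', '042'):
          {'ids': ['54c8a3f2e1382356a0d4b2c1_3']}}

located = main(events, file_details)
# If GeoText finds a location: located[key]['geo'] == (lon, lat, name)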
def print_sentence(events, file_details, server_details, geo_details):
    """
    Returns the sentence instead of calling a geo service
    Parameters
    ----------
     events: Dictionary.
                Contains filtered events from the one-a-day filter. Keys are
                (DATE, SOURCE, TARGET, EVENT) tuples, values are lists of
                IDs, sources, and issues.
    Returns
    -------
    events: Dictionary.
                Same as in the parameter but with the addition of a value that is
                a list of lon, lat, placeName, stateName, countryCode.
    """
    coll = utilities.make_conn(file_details.db_db, file_details.db_collection,
                               file_details.auth_db, file_details.auth_user,
                               file_details.auth_pass)

    for event in events:
        event_id, sentence_id = events[event]['ids'][0].split('_')
        # print(event_id)
        result = coll.find_one({'_id': ObjectId(event_id.split('_')[0])})
        sents = utilities.sentence_segmenter(result['content'])

        events[event]['geosentence'] = sents[int(sentence_id)]
        events[event]['geo'] = ("$TBD", "$TBD", "$TBD", "$TBD", "$TBD")

    return events
def cliff(events, file_details, server_details, geo_details):
    """
    Pulls out a database ID and runs the ``query_cliff`` function to hit MIT's
    CLIFF/CLAVIN geolocation system running locally and find location
    information within the sentence. Note, this function calls back to the database
    where stories are stored.
    Parameters
    ----------
    events: Dictionary.
            Contains filtered events from the one-a-day filter. Keys are
            (DATE, SOURCE, TARGET, EVENT) tuples, values are dicts holding
            lists of IDs, sources, and issues.
    Returns
    -------
    events: Dictionary.
            Same as the parameter, but with the addition of a ``geo`` value
            that is a tuple of (lon, lat, placeName, stateName, countryCode).
    """
    coll = utilities.make_conn(file_details.db_db, file_details.db_collection,
                               file_details.auth_db, file_details.auth_user,
                               file_details.auth_pass)

    for event in events:
        event_id, sentence_id = events[event]['ids'][0].split('_')
        # print(event_id)
        result = coll.find_one({'_id': ObjectId(event_id.split('_')[0])})
        sents = utilities.sentence_segmenter(result['content'])

        query_text = sents[int(sentence_id)]
        geo_info = query_cliff(query_text, geo_details.cliff_host,
                               geo_details.cliff_port)
        if geo_info:
            try:
                if geo_info['countryCode'] != "":
                    geo_info['countryCode'] = iso_convert(
                        geo_info['countryCode'])
            except Exception:
                logger.warning('Error converting country codes.')
            events[event]['geo'] = (geo_info['lon'], geo_info['lat'],
                                    geo_info['placeName'],
                                    geo_info['stateName'],
                                    geo_info['countryCode'])
            # Add in country and restype here
    return events
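A brief sketch of the result shape this function expects back from ``query_cliff``, inferred only from the keys read above; the place name and coordinates are made-up illustrations.

# Hypothetical query_cliff() result, inferred from the keys used above.
geo_info = {'lon': 36.82, 'lat': -1.29, 'placeName': 'Nairobi',
            'stateName': 'Nairobi Province', 'countryCode': 'KE'}

# countryCode is passed through iso_convert() before storage, after which the
# event carries:
# events[event]['geo'] = (lon, lat, placeName, stateName, countryCode)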
def mordecai(events, file_details, server_details, geo_details):
    """
    Pulls out a database ID and queries the Mordecai geolocation system
    running locally to find location information within the sentence.
    Parameters
    ----------
    events: Dictionary.
            Contains filtered events from the one-a-day filter. Keys are
            (DATE, SOURCE, TARGET, EVENT) tuples, values are dicts holding
            lists of IDs, sources, and issues.
    Returns
    -------
    events: Dictionary.
            Same as the parameter, but with the addition of a ``geo`` value
            that is a tuple of (lon, lat, placeName, stateName, countryCode);
            stateName is always "NA" because Mordecai does not return ADM1.
    """
    coll = utilities.make_conn(file_details.db_db, file_details.db_collection,
                               file_details.auth_db, file_details.auth_user,
                               file_details.auth_pass)

    for event in events:
        event_id, sentence_id = events[event]['ids'][0].split('_')
        # print(event_id)
        result = coll.find_one({'_id': ObjectId(event_id.split('_')[0])})
        sents = utilities.sentence_segmenter(result['content'])

        query_text = sents[int(sentence_id)]
        geo_info = query_mordecai(query_text, geo_details.mordecai_host,
                                  geo_details.mordecai_port)
        try:
            # temporary hack: take the first location:
            geo_info = geo_info[0]
            # NA is for ADM1, which mord doesn't return. See issue #2
            events[event]['geo'] = (geo_info['lon'], geo_info['lat'],
                                    geo_info['placename'], "NA",
                                    geo_info['countrycode'])
        except Exception as e:
            events[event]['geo'] = ("NA", "NA", "NA", "NA", "NA")

    return events
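Similarly, a sketch of the list-of-dicts response this expects from ``query_mordecai``, again inferred from the keys read above with made-up values; only the first entry is consumed.

# Hypothetical query_mordecai() response; only geo_info[0] is used.
geo_info = [{'lon': 69.17, 'lat': 34.53, 'placename': 'Kabul',
             'countrycode': 'AFG'},
            {'lon': 67.11, 'lat': 36.71, 'placename': 'Mazar-i-Sharif',
             'countrycode': 'AFG'}]
# Stored as: events[event]['geo'] = (69.17, 34.53, 'Kabul', 'NA', 'AFG')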
def format_content(raw_content):
    """
    Function to process a given news story for further formatting. Calls a
    helper that extracts the story text minus the date and source line, then
    splits it into sentences using the ``sentence_segmenter()`` function.

    Parameters
    ----------

    raw_content: String.
                    Content of a news story as pulled from the web scraping
                    database.

    Returns
    -------

    sent_list: List.
                List of sentences.

    """
    content = _get_story(raw_content)
    split = utilities.sentence_segmenter(content)
    return split
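A minimal usage sketch; the raw text is made up, and it assumes, per the docstring, that ``_get_story`` strips the leading date/source line.

raw_content = ('NAIROBI, Jan 1 (Reuters) - Protesters gathered in the capital. '
               'Police said the crowd dispersed peacefully.')
sentences = format_content(raw_content)
# Roughly: ['Protesters gathered in the capital.',
#           'Police said the crowd dispersed peacefully.']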
def query_all(collection, lt_date, gt_date, sources, write_file=False):
    """
    Function to query the MongoDB instance and obtain results for the desired
    date range. The query constructed is: gt_date < date_added <= lt_date.

    Parameters
    ----------

    collection: pymongo.collection.Collection.
                Collection within MongoDB that holds the scraped news stories.

    lt_date: Datetime object.
                Date results should be older than. For example, if the date
                running is the 25th and the desired date is the 24th, then
                `lt_date` is the 25th.

    gt_date: Datetime object.
                Date results should be newer than. For example, if the date
                running is the 25th and the desired date is the 24th, then
                `gt_date` is the 23rd.

    sources: List.
                Sources to pull from the MongoDB instance.

    write_file: Boolean.
                Option indicating whether to write the results from the web
                scraper to an intermediate file. Defaults to false.

    Returns
    -------

    posts: List.
            List of dictionaries of results from the MongoDB query.


    final_out: String.
                If `write_file` is True, this contains a string representation
                of the query results. Otherwise, contains an empty string.

    """

    logger = logging.getLogger('pipeline_log')
    final_out = ''
    if write_file:
        output = []
        # The punkt sentence detector is not used below, so it is not loaded.
        # sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
        posts = collection.find({"$and": [{"date_added": {"$lt": lt_date}},
                                          {"date_added": {"$gt": gt_date}},
                                          {"source": {"$in": sources}}]})
        for num, post in enumerate(posts):
            try:
                #print 'Processing entry {}...'.format(num)
                content = post['content'].encode('utf-8')
                if post['source'] == 'aljazeera':
                    content = content.replace("""Caution iconAttention The browser or device you are using is out of date.  It has known security flaws and a limited feature set.  You will not see all the features of some websites.  Please update your browser.""", '')
                header = '  '.join(utilities.sentence_segmenter(content)[:4])
                string = '{}\t{}\t{}\n{}\n'.format(num, post['date'],
                                                   post['url'], header)
                output.append(string)
            except Exception as e:
                print('Error on entry {}: {}.'.format(num, e))
        final_out = '\n'.join(output)

    posts = collection.find({"$and": [{"date_added": {"$lte": lt_date}},
                                      {"date_added": {"$gt": gt_date}},
                                      {"source": {"$in": sources}}]})

    print('Total number of stories: {}'.format(posts.count()))
    logger.info('Total number of stories: {}'.format(posts.count()))

    posts = list(posts)

    return posts, final_out
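A usage sketch for the MongoDB path, with hypothetical connection details and source names.

import datetime
from pymongo import MongoClient

# Hypothetical database and collection names.
coll = MongoClient('localhost', 27017)['event_scrape']['stories']
lt_date = datetime.datetime(2015, 1, 25)
gt_date = lt_date - datetime.timedelta(days=2)

posts, text_dump = query_all(coll, lt_date, gt_date,
                             sources=['reuters', 'bbc'], write_file=True)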
def query_all(collection, lt_date, gt_date, sources, elasticsearch, index, write_file=False):
    """
    Function to query the MongoDB instance and obtain results for the desired
    date range. The query selects stories dated after `gt_date` and up to
    `lt_date`.

    Parameters
    ----------

    collection: pymongo.collection.Collection.
                Collection within MongoDB that holds the scraped news stories.

    lt_date: Datetime object.
                Date results should be older than. For example, if the date
                running is the 25th and the desired date is the 24th, then
                `lt_date` is the 25th.

    gt_date: Datetime object.
                Date results should be newer than. For example, if the date
                running is the 25th and the desired date is the 24th, then
                `gt_date` is the 23rd.

    sources: List.
                Sources to pull from the MongoDB instance.

    elasticsearch: Boolean.
                If True, query Elasticsearch instead of MongoDB; `collection`
                is then treated as an Elasticsearch client.

    index: String.
                Name of the Elasticsearch index to query when `elasticsearch`
                is True.

    write_file: Boolean.
                Option indicating whether to write the results from the web
                scraper to an intermediate file. Defaults to false. Ignored
                when querying Elasticsearch.

    Returns
    -------

    posts: List.
            List of dictionaries of results from the MongoDB query.


    final_out: String.
                If `write_file` is True, this contains a string representation
                of the query results. Otherwise, contains an empty string.

    """

    logger = logging.getLogger('pipeline_log')
    final_out = ''
    if not elasticsearch:
        # Using elasticsearch and writing the file are incompatible at this time
        if write_file:
            output = []
            #sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
            posts = collection.find({"$and": [{"date_added": {"$lt": lt_date}},
                                              {"date_added": {"$gt": gt_date}},
                                              {"source": {"$in": sources}}]})
            for num, post in enumerate(posts):
                try:
                    # print 'Processing entry {}...'.format(num)
                    content = post['content'].encode('utf-8')
                    if post['source'] == 'aljazeera':
                        content = content.replace(
                            """Caution iconAttention The browser or device you are using is out of date.  It has known security flaws and a limited feature set.  You will not see all the features of some websites.  Please update your browser.""",
                            '')
                    header = '  '.join(utilities.sentence_segmenter(content)[:4])
                    string = '{}\t{}\t{}\n{}\n'.format(num, post['date'],
                                                       post['url'], header)
                    output.append(string)
                except Exception as e:
                    print('Error on entry {}: {}.'.format(num, e))
            final_out = '\n'.join(output)

        posts = collection.find({"$and": [{"date_added": {"$lte": lt_date}},
                                          {"date_added": {"$gt": gt_date}},
                                          {"source": {"$in": sources}}]})
    else:
        # Do a date-range query and keep only documents where stanford = 1.
        # It is questionable whether the stanford=1 check belongs here; it is
        # an error to run the phoenix_pipeline on data that has not already
        # been run through the Stanford pipeline.
        lt_time = lt_date.strftime('%Y-%m-%dT%X.%f%z')
        gt_time = gt_date.strftime('%Y-%m-%dT%X.%f%z')
        s = Search(using=collection, index=index, doc_type="news") \
            .filter("range", published_date={"lt": lt_time, "gt": gt_time}) \
            .filter("term", stanford=1)
        posts = s.execute()

    if elasticsearch:
        print('Total number of stories: {}'.format(posts.hits.total))
        logger.info('Total number of stories: {}'.format(posts.hits.total))
    else:
        print('Total number of stories: {}'.format(posts.count()))
        logger.info('Total number of stories: {}'.format(posts.count()))

    posts = posts.hits if elasticsearch else list(posts)

    return posts, final_out
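And a sketch of the Elasticsearch branch of the variant above; the host, index name, and dates are hypothetical, and the exact client construction depends on the elasticsearch-py version in use.

import datetime
from elasticsearch import Elasticsearch

client = Elasticsearch(['http://localhost:9200'])
lt_date = datetime.datetime(2015, 1, 25)
gt_date = lt_date - datetime.timedelta(days=2)

# With elasticsearch=True, `collection` is used as the ES client and `index`
# names the index of pre-processed (stanford=1) stories; sources is ignored.
posts, _ = query_all(client, lt_date, gt_date, sources=[],
                     elasticsearch=True, index='stories')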