Code example #1
File: App.py Project: junwen29/GDELT
import os

from utils import config_utils


def setup_directories():
    config = config_utils.get_app_config()
    directories = [
        config["app"]["data_directory"],
        config["logging"]["log_directory"],
        config["gdelt"]["in_process_csv_directory"],
        config["gdelt"]["processed_csv_directory"],
        config["app"]["json_directory"],
        config["app"]["ib_directory"],
        config["app"]["xml_directory"],
    ]
    for directory in directories:
        # Normalize separators for the current platform. str.replace
        # returns a new string, so the result must be reassigned.
        directory = directory.replace('\\', os.sep).replace('/', os.sep)
        if not os.path.exists(directory):
            os.makedirs(directory)
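All of these examples call config_utils.get_app_config(), which is not shown. A minimal sketch of what it could look like, assuming the configuration lives in a JSON file; the config.json name and location are assumptions, not taken from the project:

import json

def get_app_config(path="config.json"):
    # Load the nested configuration dict used throughout these examples,
    # e.g. config["app"]["data_directory"]. (Sketch; file name assumed.)
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)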
Code example #2
import datetime
import os
import time
from xml.dom import minidom
from xml.etree import ElementTree


def get_xml_tree(list_of_events):
    logger.info("GENERATING TREE ...")
    config = config_utils.get_app_config()

    root_element = ElementTree.Element('ns2:opsdashboard')
    root_element.set("xmlns:ns2", "http://www.asd.qwe.rt/sdf")

    routing_info = ElementTree.SubElement(root_element, 'routinginfo')
    sender = ElementTree.SubElement(routing_info, 'sender')
    sender.text = 'OA_FS_SENDER'

    recipient = ElementTree.SubElement(routing_info, 'recipient')
    recipient.text = 'FN_FS_Receiver_1'

    priority = ElementTree.SubElement(routing_info, 'priority')
    priority.text = 'normal'

    template = ElementTree.SubElement(routing_info, 'template')
    template.text = 'Events.xsd'

    # checksum = ElementTree.SubElement(root_element, 'checksum')

    ts = time.time()
    created_datetime = datetime.datetime.fromtimestamp(ts).strftime(
        '%Y-%m-%d %H:%M:%S')
    timestamp = ElementTree.SubElement(root_element, 'timestamp')
    timestamp.text = created_datetime

    count = ElementTree.SubElement(root_element, 'count')
    count.text = str(len(list_of_events))

    keywords = ElementTree.SubElement(root_element, 'keywords')
    keyword = ElementTree.SubElement(keywords, 'keyword')
    keyword.text = '*'

    events = ElementTree.SubElement(root_element, 'events')
    for event_object in list_of_events:
        event_node = ElementTree.SubElement(events, 'event')

        title = ElementTree.SubElement(event_node, 'title')
        title.text = event_object["title"]

        the_content = " "
        if len(event_object["content"]) > 0:
            the_content = event_object["content"]
        content = ElementTree.SubElement(event_node, 'content')
        content.text = the_content

        if event_object["content_paragraphs"] is not None:
            keywords_of_event = ElementTree.SubElement(event_node,
                                                       'content_paragraphs')
            for paragraph in event_object["content_paragraphs"]:
                keyword_of_event = ElementTree.SubElement(
                    keywords_of_event, 'paragraph')
                keyword_of_event.text = paragraph

        if event_object["hit_list"] is not None:
            keywords_of_event = ElementTree.SubElement(event_node, 'keywords')
            for word in event_object["hit_list"]:
                keyword_of_event = ElementTree.SubElement(
                    keywords_of_event, 'keyword')
                keyword_of_event.text = word

        if event_object["probable_event_dates"] is not None:
            keywords_of_event = ElementTree.SubElement(event_node,
                                                       'probable_event_dates')
            for word in event_object["probable_event_dates"]:
                keyword_of_event = ElementTree.SubElement(
                    keywords_of_event, 'probable_event_date')
                keyword_of_event.text = word

        country = ElementTree.SubElement(event_node, 'country')
        country.text = event_object["country"]

        lat = ElementTree.SubElement(event_node, 'lat')
        lat.text = event_object["lat"]

        lng = ElementTree.SubElement(event_node, 'lng')
        lng.text = event_object["lng"]

        source = ElementTree.SubElement(event_node, 'source')
        source.text = event_object["source"]
        created_datetime = ElementTree.SubElement(event_node,
                                                  'created_datetime')
        created_datetime.text = event_object["created_datetime"]

        categories = ElementTree.SubElement(event_node, 'categories')
        for category_object in event_object["categories"]:
            category_node = ElementTree.SubElement(categories, 'category')
            category_node.text = category_object["category"]

        authors = ElementTree.SubElement(event_node, 'authors')
        for author_object in event_object["authors"]:
            author_node = ElementTree.SubElement(authors, 'author')
            author_node.text = author_object

    ts = time.time()
    created_datetime = datetime.datetime.fromtimestamp(ts).strftime(
        '%Y_%m_%d_%H%M%S')

    xml_str = minidom.parseString(
        ElementTree.tostring(root_element)).toprettyxml()
    with open(
            config["app"]["xml_directory"] + os.sep + created_datetime +
            "_for_ib.xml", "wb") as f:
        f.write(xml_str.encode('utf-8'))

    logger.info("DONE: GENERATING TREE ...")
Code example #3
import datetime
import json
import os
import time


def get_json(list_of_events):
    logger.info("Generating Elasticsearch JSON for bulk indexing...")

    es_json_list = list()

    ts = time.time()
    config = config_utils.get_app_config()
    index_name = config["elasticsearch"]["events_index_name"]
    index_type = config["elasticsearch"]["events_index_type"]
    created_datetime = datetime.datetime.fromtimestamp(ts).strftime(
        '%Y_%m_%d_%H%M%S')
    json_file_path = config["app"][
        "json_directory"] + os.sep + created_datetime + "_for_es.json"
    csv_file_path = config["app"][
        "ib_directory"] + os.sep + created_datetime + "_for_ib.csv"

    logger.info(json_file_path)
    logger.info(csv_file_path)  # csv file to carry the json across the ib

    for event_object in list_of_events:

        categories = list()
        for c in event_object["categories"]:
            categories.append(c["category"])

        if event_object["hit_list"] is not None:
            for c in event_object["hit_list"]:
                categories.append(c)

        lng_lat = [
            float(event_object["lng"]),
            float(event_object["lat"])
        ]  # each event should hold only 1 location, create multiple events if the same event is held at
        # other places at the same time

        countries_list = list()
        countries_list.append(event_object["country"])

        new_event_object = {
            "created_date_time": event_object["created_datetime"],
            "location": lng_lat,
            "source": event_object["source"],
            "probable_event_dates": event_object["probable_event_dates"],
            "categories": categories,
            "countries": countries_list,
            "title": event_object["title"],
            "content": event_object["content"],
            "content_paragraphs": event_object["content_paragraphs"],
            "authors": event_object["authors"]
        }
        es_json_list.append(new_event_object)

        # By default, JSON is the output, in Elasticsearch bulk (NDJSON)
        # format: an action line followed by the document line.
        with open(json_file_path, 'a') as json_file:
            json.dump(
                {
                    "index": {
                        "_index": index_name,
                        "_type": index_type,
                        "_id": generate_id(event_object["title"])
                    }
                }, json_file)
            json_file.write("\n")
            json.dump(new_event_object, json_file)
            # The bulk format is strictly newline-delimited; blank lines
            # between documents are rejected by the _bulk endpoint.
            json_file.write("\n")

    with open(csv_file_path, 'a') as csv_file:
        with open(json_file_path, 'r') as the_file:
            csv_file.write(the_file.read())

    logger.info("Completed generating Elasticsearch JSON for bulk indexing")
Code example #4
import shutil

from utils import config_utils


def main():
    config = config_utils.get_app_config()
    data_directory_path = config["app"]["data_directory"]
    logs_directory_path = config["logging"]["log_directory"]
    # Remove the data and log directories along with their contents.
    shutil.rmtree(data_directory_path)
    shutil.rmtree(logs_directory_path)
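As written, shutil.rmtree raises FileNotFoundError if a directory is already gone. A hedged variant of the two cleanup calls, plus the usual entry-point guard (whether the original file has one is not shown in this excerpt):

shutil.rmtree(data_directory_path, ignore_errors=True)  # skip missing dirs
shutil.rmtree(logs_directory_path, ignore_errors=True)

if __name__ == "__main__":
    main()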
Code example #5
import os
import zipfile
from datetime import timedelta
from os.path import isfile, join

import datefinder
import requests
from bs4 import BeautifulSoup
from goose3 import Goose

import App
from gdelt_countries_mapping import countries_mapping
from gdelt_events_mapping import event_codes_mapping
from gdelt_headers import headers
from utils import config_utils, events_utils

config = config_utils.get_app_config()

browser_headers = config["gdelt"]["browser"]["headers"]
browser_timeout = config["gdelt"]["browser"]["time_out"]
base_codes_we_want = config["gdelt"]["event_base_codes_we_want"]
event_codes_to_exclude = config["gdelt"]["event_codes_to_exclude"]
countries_we_want = config["gdelt"]["countries_we_want"]
keywords_we_want = config["gdelt"]["keywords_we_want"]
is_delta_crawl = config["gdelt"]["is_delta_crawl"]
max_urls_to_crawl = config["gdelt"]["max_csv_urls_to_crawl"]
month_abbreviations = config["gdelt"]["month_abbreviations"]
months_of_year = config["gdelt"]["months_of_year"]

in_process_csv_directory = (
    config["gdelt"]["in_process_csv_directory"]).replace('\\', os.sep).replace(
        '/', os.sep)
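The isfile and join imports above hint at how the normalized directory is consumed later in the file. A sketch of listing the CSV files waiting to be processed; the .csv suffix filter is an assumption:

pending_csv_files = [
    join(in_process_csv_directory, name)
    for name in os.listdir(in_process_csv_directory)
    if isfile(join(in_process_csv_directory, name)) and name.endswith(".csv")
]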
Code example #6
File: gdacs.py Project: junwen29/GDELT
def run():
    logger.info("Running GDACS script")
    config = config_utils.get_app_config()
    rss_24h_feed_url = config["gdacs"]["rss_24h_feed_url"]
    try:
        if config["proxy"]["enabled"].lower() == "true":
            proxy_handler = urllib.request.ProxyHandler({
                "http":
                config["proxy"]["http_ip_port"],
                "https":
                config["proxy"]["https_ip_port"]
            })
            logger.info("Added proxy handler")
            gdacs_feed = feedparser.parse(rss_24h_feed_url,
                                          handlers=[proxy_handler])
        else:
            gdacs_feed = feedparser.parse(rss_24h_feed_url)

        logger.info('Getting disasters from GDACS ...')
        items = gdacs_feed['entries']
        events_list = list()

        logger.info(
            "Number of disaster(s) from the GDACS 24-hour RSS feed to process: {}"
            .format(len(items)))
        num_errors = 0

        for i in range(len(items)):
            try:
                logger.info(
                    "Processing #{} disaster from GDACS feed ...".format(
                        i + 1))
                item = items[i]
                event_date = item["published"]
                logger.debug(event_date)

                # Mon, 26 Nov 2018 01:19:41 GMT
                event_date = datetime.datetime.strptime(
                    event_date, '%a, %d %b %Y %H:%M:%S %Z').strftime('%Y%m%d')
                logger.debug(event_date)
                title = item["title"]
                description = item["description"]
                content_paragraph_list = list()
                content_paragraph_list.append(description)
                lat = item["geo_lat"]
                lng = item["geo_long"]
                country = item["gdacs_country"]
                event_type = item["gdacs_eventtype"]
                if event_type in event_types:
                    event_type = event_types[event_type]
                else:
                    event_type = "445"
                source = item["link"]

                ts = time.time()
                created_datetime = datetime.datetime.fromtimestamp(
                    ts).strftime('%Y-%m-%d %H:%M:%S')
                probable_event_date_list = list()
                probable_event_date_list.append(
                    datetime.datetime.fromtimestamp(ts).strftime('%Y-%m-%d'))
                category_list = list()
                event_str = event_codes_mapping[event_type]
                category_list.append({"category": event_str})
                author_list = list()
                author_list.append(
                    "Global Disaster Alert and Coordination System (GDACS)")
                event_object = events_utils.generate_event(
                    title, description, content_paragraph_list, source,
                    created_datetime, probable_event_date_list, country, lat,
                    lng, category_list, author_list)
                logger.info(
                    "Completed processing #{} disaster from GDACS feed".
                    format(i + 1))
                events_list.append(event_object)
                logger.info('Currently {} event(s) built'.format(
                    len(events_list)))
            except Exception:
                logger.exception(
                    "Failed to process #{} disaster from GDACS feed ...".
                    format(i + 1))
                num_errors += 1
                continue

        events_utils.get_json(events_list)

        if config["gdelt"]["generate_xml_files"]:
            events_utils.get_xml_tree(events_list)

        logger.info('\n\n### Summary of GDACS events ###')
        logger.info('Number of disasters from GDACS = {}'.format(len(items)))
        logger.info('Number of events generated from GDACS = {}'.format(
            len(events_list)))
        logger.info('Number of erroneous disasters = {}\n'.format(num_errors))
    except Exception:
        logger.exception("Failed to capture RSS feed from GDACS")