Example #1
def main():
    """
    Utility to cache messages from all queues from the --hostname provided with 'cache: true' option set in embers.conf
    --hostname  : Cache all active queues on this host
    --log_file  : Path to write the log file to
    --log_level : Logging level
    """
    from etool import args
    global log

    arg_parser = args.get_parser()
    arg_parser.add_argument('--hostname', metavar='HOSTNAME', type=str, default=environ.get('HOSTNAME', None),
                            help="The hostname of the machine whose services' data you wish to cache")
    arg = arg_parser.parse_args()

    log = logs.getLogger(log_name=arg.log_file)
    logs.init(arg, l=arg.log_level, logfile=arg.log_file)
    conf.init(arg)

    assert arg.hostname, '--hostname must be provided'
    queues = conf.get_all_cached_queues(hostname=arg.hostname)
    pool = []

    for queue in queues:
        log.info('Spawning cache process for %s' % queue)
        p = multiprocessing.Process(name=queue, target=cache_queue, args=(queue,))
        p.start()
        pool.append(p)

    try:
        for process in pool:
            process.join()
            log.warning('%s caching has stopped' % process.name)
    except KeyboardInterrupt:
        log.warning('Keyboard interrupt in main')
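
A stripped-down sketch of the spawn-and-join pattern used by this utility is shown below; the stub worker and queue names are placeholders, not the real cache_queue from etool.cache.elastic.cache.

import multiprocessing
import time


def stub_cache_queue(queue_name):
    # Placeholder worker: the real cache_queue would read messages from
    # queue_name and write them to the cache until stopped.
    time.sleep(1)


if __name__ == '__main__':
    pool = []
    for queue_name in ['queue_a', 'queue_b']:  # hypothetical queue names
        p = multiprocessing.Process(name=queue_name, target=stub_cache_queue,
                                    args=(queue_name,))
        p.start()
        pool.append(p)

    # join() blocks until each child exits, so the message below only appears
    # once that queue's caching process has stopped.
    for p in pool:
        p.join()
        print('%s caching has stopped' % p.name)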
Example #2
def main():
    """
    Utility to query warnings stored in Elasticsearch
    --log_file     : Path to write the log file to
    --log_level    : Logging level
    """
    from etool import args
    global log

    arg_parser = args.get_parser()
    arg = arg_parser.parse_args()

    log = logs.getLogger(log_name=arg.log_file)
    logs.init(arg, l=arg.log_level, logfile=arg.log_file)

    print(query(max_results=30))
Example #3
def main():
    """
    Utility to cache messages from all queues from the --hostname provided with 'cache: true' option set in embers.conf
    --hostname  : Cache all active queues on this host
    --log_file  : Path to write the log file to
    --log_level : Logging level
    """
    from etool import args
    global log

    arg_parser = args.get_parser()
    arg_parser.add_argument(
        '--hostname',
        metavar='HOSTNAME',
        type=str,
        default=environ.get('HOSTNAME', None),
        help=
        "The hostname of the machine whose services' data you wish to cache")
    arg = arg_parser.parse_args()

    log = logs.getLogger(log_name=arg.log_file)
    logs.init(arg, l=arg.log_level, logfile=arg.log_file)
    conf.init(arg)

    assert arg.hostname, '--hostname must be provided'
    queues = conf.get_all_cached_queues(hostname=arg.hostname)
    pool = []

    for queue in queues:
        log.info('Spawning cache process for %s' % queue)
        p = multiprocessing.Process(name=queue,
                                    target=cache_queue,
                                    args=(queue, ))
        p.start()
        pool.append(p)

    try:
        for process in pool:
            process.join()
            log.warning('%s caching has stopped' % process.name)
    except KeyboardInterrupt:
        log.warning('Keyboard interrupt in main')
Example #4
def main():
    ap = args.get_parser()
    ap.add_argument('-i', '--input', default='sys.stdin', type=str, help='Path to the input file. '
                    'Default is sys.stdin')
    ap.add_argument('-o', '--out', default='sys.stdout', type=str, help='Path to the output file. '
                    'Default is sys.stdout')
    ap.add_argument('searchPhrase', default='config/phrases.txt', type=str, help='Path to '
                    'the phrase file if the "-f" flag is specified; otherwise the input string is '
                    'considered to be the phrase.')
    ap.add_argument('-f', '--file', action='store_true', default=False, help='If given, the '
                    'searchPhrase argument is interpreted as a path to a file')
    global logger
    logger = logs.getLogger("%s-%s.log" % (__processor__, str(datetime.now())))
    arg = ap.parse_args()
    logs.init(arg)
    inputFile = None
    outFile = None
    phraseFile = None

    if arg.input == 'sys.stdin':
        reader = codecs.getreader('utf-8')(sys.stdin)
    else:
        inputFile = open(arg.input, "r")
        reader = codecs.getreader('utf-8')(inputFile)
    if arg.out == 'sys.stdout':
        writer = codecs.getwriter('utf-8')(sys.stdout)
    else:
        outFile = codecs.open(arg.out, "w", encoding="utf-8")
        writer = codecs.getwriter('utf-8')(outFile)
    if arg.file:
        phraseFile = codecs.open(arg.searchPhrase, encoding='utf-8')
        generatePhraseList(phraseFile.readlines())
    else:
        generatePhraseList([arg.searchPhrase])
    phraseSearch(reader, writer)
    #close all files
    if inputFile:
        inputFile.close()
    if outFile:
        outFile.close()
    if phraseFile:
        phraseFile.close()
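
The stream handling above relies on codecs reader/writer wrappers. The following self-contained sketch applies the same wrapping to an in-memory byte stream instead of sys.stdin (the sample text is invented):

import codecs
import io

# Wrap a byte stream the same way main() wraps sys.stdin above.
raw = io.BytesIO('phrase search input line\n'.encode('utf-8'))
reader = codecs.getreader('utf-8')(raw)
print(reader.readline())  # decoded unicode line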
Example #5
def main():
    """
    Utility to set up a mapping for an EMBERS queue in Elasticsearch
    -q | --queue     : Queue name to set up the mapping for. Settings are read from embers.conf
    --log_file  : Path to write the log file to
    --log_level : Logging level
    """
    from etool import args
    global log

    arg_parser = args.get_parser()
    arg_parser.add_argument('-q', '--queue', help='Queue name to map into Elasticsearch')
    arg = arg_parser.parse_args()

    assert arg.queue, '--queue must be provided'

    log = logs.getLogger(log_name=arg.log_file)
    logs.init(arg, l=arg.log_level, logfile=arg.log_file)

    add_type(index_name=general.get_index_name(), type_name=arg.queue)
Example #6
    def __init__(self, wg_data=WG_DATA, co_admin_data=CO_ADMIN_DATA,
                 priority_policy=PRIORITY_POLICY,
                 debug=False):
        """
        """
        self.priority_policy = priority_policy
        self.debug = debug
        self.__version__ = "{0}-{1}-{2}-{3}-{4}".format(
            self.__class__.__name__,
            __version__,
            hashlib.md5(get_wg_data(wg_data).read()).hexdigest(),
            hashlib.md5(get_co_admin_data(co_admin_data).read()).hexdigest(),
            hashlib.md5(" ".join(self.priority_policy)).hexdigest())

        if self.debug:
            try:
                logs.init()
            except IOError:
                logs.init(logfile=self.__class__.__name__.lower())

            self.log = logs.getLogger("{0}-{1}".format(
                                      self.__class__.__name__,
                                      __version__.replace('.', '_')))

        # 1. load country and admin1 level geo data
        f = get_co_admin_data(co_admin_data)
        dialect = csv.Sniffer().sniff(f.read(10240), delimiters="\t")
        f.seek(0)
        reader = csv.DictReader(f, dialect=dialect, fieldnames=CO_ADMIN_FIELDS)
        # NOTE:
        # Known conflicts b/w codes of countries and other admins
        # co Colombia ('Colombia', 'C\xc3\xb3rdoba')
        # cl Chile ('Colombia', 'Caldas')
        # ar Argentina ('Colombia', 'Arauca')
        # sv El Salvador ('El Salvador', 'San Vicente')

        # prep lookup dictionaries
        # key__value

        # countries
        self.co_code = {}
        self.co_names = {}
        self.co_aliases = {}
        self.co_capital_cities = {}
        # admin1
        self.admin_code = {}
        self.admin_name = {}
        # assumes countries appear first when reading data from
        # lac_co_admin TODO BAD!
        for r in reader:
            for k in r.keys():
                r[k] = r[k].strip()
            lat = float_or_none(r['latitude'])
            lon = float_or_none(r['longitude'])
            code = object_or_none(r['iso_3166_code'])
            rid = int_or_none(r["id"])
            if r['type'] == 'country':
                # country
                if code:
                    self.co_code[code] = r['name']
                    self.co_names[nstr(r['name'])] = (rid, lat, lon,
                                                      code, r['name'])
                    self.co_capital_cities[nstr(r['capital_city'])] =\
                        (r['capital_city'], r['name'])
                    aliases = r['alt_names'].split(',')
                    self.co_aliases.update({nstr(alias.strip()): r['name']
                                            for alias in aliases})
                else:
                    if self.debug:
                        self.log.error("Bad data country {0} Code {1}".format(
                                       r['name'], code))
            elif r['type'] == 'admin':
                # admin
                admin, co = r['full_name'].split(',')
                admin, co = admin.strip(), co.strip()

                if code:
                    if code not in self.admin_code:
                        self.admin_code[code] = []
                    self.admin_code[code].append((co, admin))
                co1, a = nstr(co), nstr(admin)
                if a not in self.admin_name:
                    self.admin_name[a] = {}
                if co1 not in self.admin_name[a]:
                    self.admin_name[a][co1] = (rid, lat, lon, code, admin, co)

        f.close()

        # 2. load (world-gazetteer) city level geo data
        f = get_wg_data(wg_data)
        dialect = csv.Sniffer().sniff(f.read(10240), delimiters="\t")
        f.seek(0)
        reader = csv.DictReader(f, dialect=dialect, fieldnames=WG_FIELDS)
        self.ci_aliases = {}
        # main data store for geocoding
        self.data = []
        counter = 0
        ci_set = set()
        for r in reader:
            for k in r.keys():
                r[k] = r[k].strip()
            # get alias names for cities
            ci_names = [a.strip() for a in r['alt_names'].split(',')
                        if len(a.strip()) > 0]
            ci_names.extend([a.strip() for a in r['orig_names'].split(',')
                             if len(a.strip()) > 0])
            for ci in ci_names:
                k = (nstr(ci), nstr(r['country']))
                a1 = nstr(r['admin1'])
                if k not in self.ci_aliases:
                    self.ci_aliases[k] = {a1: set([r['name']])}
                elif a1 not in self.ci_aliases[k]:
                    self.ci_aliases[k][a1] = set([r['name']])
                else:
                    # Cases where different cities for same
                    # admin-country pair have the same alias
                    self.ci_aliases[k][a1].add(r['name'])
                # add ci name aliases into ci_set
                ci_set.add(nstr(ci))
            # store only canonical city names
            self.data.append((counter, (r['name'], r['country'],
                              r['admin1'],
                              object_or_none(r['admin2']),
                              object_or_none(r['admin3']),
                              int_or_none(r['pop']),
                              float_or_none(r['latitude']) / 100,
                              float_or_none(r['longitude']) / 100,
                              int(r['id']), int(r['padded']))))
            counter += 1

        self.coordinates = {}
        # cases where admin1 and city share the same name
        # extended feature/hack #1 to resolve city when
        # only country and admin1 are specified
        self.same_ci_a1_name = {}
        for i, (n, c, a1, a2, a3, p, lat, lon, i_d, pad) in self.data:
            nn, nc, na1 = nstr(n), nstr(c), nstr(a1)
            self.coordinates[(lat, lon)] = i
            if nn == na1 and pad == 0:
                self.same_ci_a1_name[(nc, na1)] = n
            ci_set.add(nn)

        # store (lat, lon)
        self.kdtree = KDTree([[i, j] for i, j in self.coordinates.keys()
                              if i is not None and j is not None])
        # build regular expr dicts
        co_set = set(self.co_names.keys())
        # add country name aliases into co_set
        co_set.update(self.co_aliases.keys())
        self.co_reg = ManyRE(co_set)
        self.ci_reg = ManyRE(ci_set)
        # add admin1 name aliases into admin1_set
        admin1_set = set(self.admin_name.keys())
        # build regular expression stores for co-admin1-ci
        self.admin1_reg = ManyRE(admin1_set)
        # add stopwords to prevent any 2-letter word in common usage
        # from being misinterpreted as a country or admin code
        two_letter_stop_words = set(
            ['BE', 'WE', '\xc3\xa0', 'YO', 'DO', 'YA', 'DE', 'DA', 'HA', 'BY',
             'HE', 'AL', 'NI', 'LE', 'NO', 'LO', 'TU', 'TO', 'TI', 'TE', 'EM',
             'EL', 'EN', 'IS', 'OS', 'AM', 'IT', 'AO', 'AN', 'AS', 'AT', 'IN',
             'EU', 'ES', 'IF', 'ME', 'ON', 'OF', 'LA', 'MI', 'UP', 'SU', 'UM',
             'UN', 'SO', 'NA', 'OU', 'MY', 'OR', 'SE', 'US'])

        self.co_code_reg = ManyRE([sw for sw in self.co_code.keys()
                                  if sw not in two_letter_stop_words])
        self.admin1_code_reg1 = ManyRE(self.admin_code.keys())
        self.admin1_code_reg2 = ManyRE([sw for sw in self.admin_code.keys()
                                       if sw not in two_letter_stop_words])

        self.bguess = {}
        for i, (city, country, admin1, a2, a3, p, la, lo, i_d, pad)\
                in self.data:

            ci, co, a = nstr(city), nstr(country), nstr(admin1)
            # value is list of admin1's that correspond to ci-co key
            # ci-co makes dictionary flatter
            # choose not to use co-admin1-ci as key to add more flexibility
            # for lookups
            if ci in self.bguess:
                if co in self.bguess[ci]:
                    if a in self.bguess[ci][co]:
                        # store original wg-records marked with pad = 0
                        # to head of the queue
                        if pad == 0:
                            self.bguess[ci][co][a].appendleft(i)
                        else:
                            self.bguess[ci][co][a].append(i)
                    else:
                        self.bguess[ci][co][a] = deque([i])
                else:
                    self.bguess[ci][co] = {a: deque([i])}
            else:
                self.bguess[ci] = {co: {a: deque([i])}}
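
The nested bguess store built above is easier to picture with a toy instance. The sketch below uses hypothetical city/country/admin1 keys and row indices; only the shape (city -> country -> admin1 -> deque of indices into self.data, with canonical pad == 0 records at the head) comes from the code.

from collections import deque

# Hypothetical toy instance of the lookup store; keys are nstr()-normalized names.
bguess = {'bogota': {'colombia': {'cundinamarca': deque([17, 204])}}}


def best_row(store, ci, co, a):
    """Return the preferred data-row index for a (city, country, admin1) key."""
    try:
        return store[ci][co][a][0]  # head of the deque = canonical record
    except KeyError:
        return None


print(best_row(bguess, 'bogota', 'colombia', 'cundinamarca'))  # -> 17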
Example #7
#!/usr/bin/env python

import sys
import json
from etool import args, logs, queue
from embers.geocode import Geo, GEO_REGION, PRIORITY_POLICY as LA_POLICY
from embers.geocode_mena import GeoMena, PRIORITY_POLICY as MENA_POLICY
from embers.utils import normalize_str

__processor__ = 'geo_code_stream.py'
log = logs.getLogger('%s.log' % (__processor__))

LOC_HEADERS = (u"geocode_version", u"city", u"country", u"admin1", u"admin2", u"admin3", u"pop",
               u"latitude", u"longitude", u"id", u"pad", u"source")


def decode(s, encoding='utf-8'):
    try:
        return s.decode(encoding=encoding)
    except Exception:
        return s


def get_geoInfo(tweet, geo):
    geotuple = [decode(geo.__version__)] + [decode(l) for l in geo.geo_normalize(tweet)]
    return dict(zip(LOC_HEADERS, geotuple))


def isempty(s):
    """return if string is empty
    """
Example #8
def main():
    """
    Utility to cache messages from a queue into Elasticsearch
    -q | --queue   : Read from <queue> and write the messages to Elasticsearch. Settings are read from embers.conf
    --log_file     : Path to write the log file to
    --log_level    : Logging level
    """
    from etool import args
    global log

    arg_parser = args.get_parser()
    arg_parser.add_argument('-q',
                            '--queue',
                            help='Queue name to index into Elasticsearch')
    arg_parser.add_argument(
        '-s',
        '--s3fromq',
        action='store_true',
        help='ingest from S3 prefix derived from queue name')
    arg_parser.add_argument('-p', '--prefix', help='Ingest from prefix')
    #arg_parser.add_argument('-t', '--typename', default='noqueue', help='Type for prefix ingest')
    arg_parser.add_argument('-t', '--typename', help='Type for prefix ingest')
    arg_parser.add_argument(
        '-l',
        '--tmpcopy',
        default='/home/embers/data/tmpcopy',
        help='Name of local copy of S3 file (same for all S3 files)')
    arg_parser.add_argument('-c',
                            '--chunk',
                            type=int,
                            default=100,
                            help='Chunk size for S3 ingest')
    arg_parser.add_argument('-i',
                            '--clustername',
                            help='Clustername to determine index name')
    arg_parser.add_argument(
        '-w',
        '--withbase',
        action="store_true",
        help="Add basename to prefix when looking for type.")
    arg_parser.add_argument('--startdate',
                            help='start date in format like 2015-01-02')
    arg_parser.add_argument('--enddate',
                            help='end date in format like 2015-01-02')
    arg = arg_parser.parse_args()

    #assert (arg.queue or (arg.prefix and arg.typename)), 'Either --queue (with optional --s3fromq/--typename) or --prefix with --typename must be provided'
    assert (
        arg.queue or arg.prefix
    ), 'Either --queue (with optional --s3fromq/--typename) or --prefix must be provided'

    log = logs.getLogger(log_name=arg.log_file)
    logs.init(arg, l=arg.log_level, logfile=arg.log_file)

    index_name = general.get_index_name(arg.clustername)

    queue.init()

    if arg.prefix or (arg.queue and arg.s3fromq):
        if arg.prefix:
            prefix = arg.prefix
            # get queue name or its substitute for S3 objects from prefix
            if arg.typename:
                type_name = arg.typename
            else:
                type_name = queue.conf.get_prefixpair(
                    prefix=prefix, includeS3=True, withBasename=arg.withbase)
                if not type_name:
                    log.error("Could not get type from prefix %s" % prefix)
                    return 1
                log.warning("type_name=%s from prefix=%s" %
                            (type_name, prefix))
        else:
            type_name = arg.queue
            prefix, include = queue.conf.get_prefix_for_queue(
                type_name, withBasename=False)
            if not prefix:
                log.error("Could not get S3 prefix for queue %s" % type_name)
                return 1

        if not general.get_es_connection().indices.exists_type(
                index=index_name, doc_type=type_name):
            # Create mapping if the queue has not been stored in Elasticsearch yet
            index_setup.add_type(index_name=index_name, type_name=type_name)

        conn_s3 = boto.connect_s3(aws_access_key_id=arg.aws_key,
                                  aws_secret_access_key=arg.aws_secret)
        bucket = conn_s3.get_bucket(
            arg.bucket)  # connect to S3, get bucket ptr for arg.bucket
        attach_to_s3(index_name,
                     s3prefix=prefix,
                     bucket=bucket,
                     type_name=type_name,
                     tmpcopy=arg.tmpcopy,
                     chunk_size=arg.chunk,
                     startdate=arg.startdate,
                     enddate=arg.enddate)
    else:

        if arg.typename:
            type_name = arg.typename
        else:
            type_name = arg.queue

        if not general.get_es_connection().indices.exists_type(
                index=index_name, doc_type=type_name):
            # Create mapping if the queue has not been stored in Elasticsearch yet
            index_setup.add_type(index_name=index_name, type_name=type_name)

        attach_to_queue(index_name=index_name,
                        queue_name=arg.queue,
                        type_name=type_name)
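
For quick reference, the argument handling above allows three ways of choosing the data source. The argument lists below are hypothetical illustrations; queue names, prefixes and dates are invented:

# Hypothetical invocations for the three ingest paths handled in main():
queue_mode = ['--queue', 'my_queue', '--clustername', 'my_cluster']
s3_from_queue = ['--queue', 'my_queue', '--s3fromq',
                 '--startdate', '2015-01-01', '--enddate', '2015-01-31']
s3_from_prefix = ['--prefix', 'incoming/my_feed', '--typename', 'my_feed']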
Example #9
#!/usr/bin/env python

import os
import sys
import codecs
import json
from etool import args, logs, queue, iqueue
from geo2.country import GeoCountry

__processor__ = 'geo_code_stream.py'
log = logs.getLogger("geo_code_stream")  # '%s.log' % (__processor__))


def annotate(msg, geoc, filter_region=None):
    """
    Annotate message with geocountry info
    Params:
    msg - dict object
    geoc - GeoCountry object
    filter_region - region to be filtered
    """
    content = geoc.annotate(msg)
    content_region = content.get("embersGeoCodeCountry", {}).get("region", None)
    if content_region is None:
        return None

    if filter_region is not None and filter_region != content_region:
        return None

    return content
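
A minimal usage sketch for annotate() follows; the stub class stands in for geo2.country.GeoCountry, whose real construction and output format are not shown in this snippet.

class StubGeoCountry(object):
    # Stand-in for GeoCountry: returns a copy of the message with a fabricated
    # embersGeoCodeCountry block so the filtering logic can be exercised.
    def annotate(self, msg):
        out = dict(msg)
        out["embersGeoCodeCountry"] = {"country": "Colombia", "region": "LA"}
        return out


msg = {"text": "example message"}
print(annotate(msg, StubGeoCountry()))                        # annotated copy
print(annotate(msg, StubGeoCountry(), filter_region="MENA"))  # None (region filtered)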
Example #10
#!/usr/bin/python
# -*- coding: utf-8 -*-

__author__ = "Wei Wang"
__email__ = "*****@*****.**"

from etool import logs, queue, args
import json
from datetime import datetime

__processor__ = 'listen_warning'
logs.getLogger(__processor__)
SENT_WARNINGS = []


def check_ifexist(warning):
    eventDate = warning["eventDate"]
    eventType = warning["eventType"]
    population = warning["population"]

    if [eventDate, eventType, population] in SENT_WARNINGS:
        return True
    else:
        SENT_WARNINGS.append([eventDate, eventType, population])
        return False


def main():
    ap = args.get_parser()
    ap.add_argument('--out', help="the output file of warnings")
    arg = ap.parse_args()
Example #11
__author__ = 'mogren'
"""
General Elasticsearch queries for Twitter feeds
"""

import sys
import general as es
import re
from etool import logs

log = logs.getLogger(__name__)

twitter_date_field = 'date'
twitter_text_field = 'text'


def get_tweets(keywords=None,
               geo_box=None,
               start_time=None,
               end_time=None,
               max_results=10):
    """
    Retrieves all tweets containing the provided keywords within the given time frame. If no parameters are provided,
    this function will return the 10 most recent tweets in the index.

    If end_time is not provided and start_time is, end_time will be the present time.

    :param keywords: str, list of strings, or dict - {'<field_name>': '<value>'} or {'<field_name>': [<list_of_values>]}
    :param geo_box: dict {'lat': {'min': <value>, 'max': <value>}, 'lng':{'min': <value>, 'max': <value>}}
    :param start_time: ISO formatted date string
    :param end_time: ISO formatted date string
Example #12
#!/usr/bin/env python
# -*- coding: UTF-8 -*-

__author__ = 'Michael Shuffett'
__email__ = '*****@*****.**'

# from datetime import datetime
# from datetime import timedelta
import urllib2
import json
from etool import args, logs, queue, message
import os

log = logs.getLogger('wikipedia_recent_changes')


class API(object):
    """Wikipedia API class"""

    def __init__(self, localization="es"):
        self._url = "http://%s.wikipedia.org/w/api.php" % localization
        self._max_ids = 50

    def get_recent_changes(self, namespace=0):
        """Queries Wikipedia site for the latest changes.

        :param integer namespace: the namespace to restrict to, defaults to 0.
        :returns: a list of changes sorted in reverse chronological order.
        :rtype: list
        """
        url = "%s?action=query&list=recentchanges&format=json&rclimit=max&rcnamespace=%d" % (self._url, namespace)
Example #13
#!/usr/bin/env python

import os
import sys
import codecs
import json
from etool import args, logs, queue, iqueue
from geo2.country import GeoCountry

__processor__ = 'geo_code_stream.py'
log = logs.getLogger("geo_code_stream")  # '%s.log' % (__processor__))


def annotate(msg, geoc, filter_region=None):
    """
    Annotate message with geocountry info
    Params:
    msg - dict object
    geoc - GeoCountry object
    filter_region - region to be filtered
    """
    content = geoc.annotate(msg)
    content_region = content.get("embersGeoCodeCountry",
                                 {}).get("region", None)
    if content_region is None:
        return None

    if filter_region is not None and filter_region != content_region:
        return None

    return content
Example #14
def testLog():
    __processor__ = 'TestLog'
    log = logs.getLogger(__processor__)
    logs.init()
    log.info("Error: %s" % "I'm Here")
Example #15
#!/usr/bin/env python

import sys
import json
from etool import args, logs, queue
from embers.geocode import Geo, GEO_REGION, PRIORITY_POLICY as LA_POLICY
from embers.geocode_mena import GeoMena, PRIORITY_POLICY as MENA_POLICY
from embers.utils import normalize_str

__processor__ = 'geo_code_stream.py'
log = logs.getLogger('%s.log' % (__processor__))

LOC_HEADERS = (u"geocode_version", u"city", u"country", u"admin1", u"admin2",
               u"admin3", u"pop", u"latitude", u"longitude", u"id", u"pad",
               u"source")


def decode(s, encoding='utf-8'):
    try:
        return s.decode(encoding=encoding)
    except Exception:
        return s


def get_geoInfo(tweet, geo):
    geotuple = [decode(geo.__version__)
                ] + [decode(l) for l in geo.geo_normalize(tweet)]
    return dict(zip(LOC_HEADERS, geotuple))


def isempty(s):
Example #16
__author__ = 'mogren'
"""
General caching service
"""

from etool import conf
from os import environ
import sys
from etool import logs
import multiprocessing
from etool.cache.elastic.cache import cache_queue

log = logs.getLogger(__name__)


def main():
    """
    Utility to cache messages from all queues from the --hostname provided with 'cache: true' option set in embers.conf
    --hostname  : Cache all active queues on this host
    --log_file  : Path to write the log file to
    --log_level : Logging level
    """
    from etool import args
    global log

    arg_parser = args.get_parser()
    arg_parser.add_argument('--hostname', metavar='HOSTNAME', type=str, default=environ.get('HOSTNAME', None),
                            help="The hostname of the machine whose services' data you wish to cache")
    arg = arg_parser.parse_args()

    log = logs.getLogger(log_name=arg.log_file)
Example #17
#  GNU General Public License for more details.
#
#  You should have received a copy of the GNU General Public License
#  along with this program; if not, write to the Free Software
#  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
#  MA 02110-1301, USA.
#
#

from etool import args, logs, queue
import os
import codecs
import time
import json

log = logs.getLogger('test_publisher')
"""
test_publisher.py

Arguments (required):
--pub		the feed to publish to
--json_file	the JSON file to read messages from

Arguments (optional):
--ssh_key	the private key to use to tunnel to EMBERS
--tunnel	the host to tunnel to

test_publisher.py will:
- Continuously read from a file
- Publish each JSON message to the specified queue
- Once it reaches EOF, start again
Example #18
#  You should have received a copy of the GNU General Public License
#  along with this program; if not, write to the Free Software
#  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
#  MA 02110-1301, USA.
#  
#  

from etool import args, logs, queue, message
import os
import codecs
import json
import subprocess
import socket
from time import sleep

log = logs.getLogger('psl_harness')

"""
psl_harness.py

Arguments (required):
--sub			the feed to subscribe to
--pub			the feed to publish to
--local_port	local port to forward and receive messages 

Arguments (optional):
--ssh_key		the private key to use to tunnel to EMBERS
--tunnel		the host to tunnel to

psl_harness.py will:
- Continuously read from a queue
Example #19
#  GNU General Public License for more details.
#  
#  You should have received a copy of the GNU General Public License
#  along with this program; if not, write to the Free Software
#  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
#  MA 02110-1301, USA.
#  
#  

from etool import args, logs, queue
import os
import codecs
import time
import json

log = logs.getLogger('test_publisher')

"""
test_publisher.py

Arguments (required):
--pub		the feed to publish to
--json_file	the JSON file to read messages from

Arguments (optional):
--ssh_key	the private key to use to tunnel to EMBERS
--tunnel	the host to tunnel to

test_publisher.py will:
- Continuously read from a file
- Publish each JSON message to the specified queue
Example #20
import sqlite3 as lite
from Util import common
import json
from datetime import datetime
import hashlib
from etool import logs
import sys
import argparse
# import historical raw data into the database
con = None
cur = None
__processor__ = "ImportArchivedNews"
log = logs.getLogger(__processor__)

def init():
    global con
    global cur
    
    con = common.getDBConnection()
    cur = con.cursor()
    logs.init()

def insert_news(article):
    try:
        global con
        global cur
        
        sql = "insert into t_daily_news(embers_id,title,author,post_time,post_date,content,stock_index,source,update_time,url) values (?,?,?,?,?,?,?,?,?,?)"
        embersId = article["embersId"]
        title = article["title"]
        author = article["author"]
Example #21
def main():
    """
    Utility to cache messages from a queue into Elasticsearch
    -q | --queue   : Read from <queue> and write the messages to Elasticsearch. Settings are read from embers.conf
    --log_file     : Path to write the log file to
    --log_level    : Logging level
    """
    from etool import args
    global log

    arg_parser = args.get_parser()
    arg_parser.add_argument('-q', '--queue', help='Queue name to index into Elasticsearch')
    arg_parser.add_argument('-s', '--s3fromq', action='store_true', help='ingest from S3 prefix derived from queue name')
    arg_parser.add_argument('-p', '--prefix', help='Ingest from prefix')
    #arg_parser.add_argument('-t', '--typename', default='noqueue', help='Type for prefix ingest')
    arg_parser.add_argument('-t', '--typename', help='Type for prefix ingest')
    arg_parser.add_argument('-l', '--tmpcopy', default='/home/embers/data/tmpcopy', help='Name of local copy of S3 file (same for all S3 files)')
    arg_parser.add_argument('-c', '--chunk', type=int, default=100, help='Chunk size for S3 ingest')
    arg_parser.add_argument('-i', '--clustername', help='Clustername to determine index name')
    arg_parser.add_argument('-w', '--withbase', action="store_true", help="Add basename to prefix when looking for type.")
    arg_parser.add_argument('--startdate', help='start date in format like 2015-01-02')
    arg_parser.add_argument('--enddate', help='end date in format like 2015-01-02')
    arg = arg_parser.parse_args()

    #assert (arg.queue or (arg.prefix and arg.typename)), 'Either --queue (with optional --s3fromq/--typename) or --prefix with --typename must be provided'
    assert (arg.queue or arg.prefix), 'Either --queue (with optional --s3fromq/--typename) or --prefix must be provided'

    log = logs.getLogger(log_name=arg.log_file)
    logs.init(arg, l=arg.log_level, logfile=arg.log_file)

    index_name = general.get_index_name(arg.clustername)

    queue.init()

    if arg.prefix or (arg.queue and arg.s3fromq):
        if arg.prefix:
            prefix = arg.prefix
            # get queue name or its substitute for S3 objects from prefix
            if arg.typename:
                type_name = arg.typename
            else:
                type_name = queue.conf.get_prefixpair(prefix=prefix,includeS3=True,withBasename=arg.withbase)
                if not type_name:
                    log.error("Could not get type from prefix %s" % prefix)
                    return 1
                log.warning("type_name=%s from prefix=%s" % (type_name, prefix))
        else:
            type_name = arg.queue
            prefix, include = queue.conf.get_prefix_for_queue(type_name, withBasename=False)
            if not prefix:
                log.error("Could not get S3 prefix for queue %s" % type_name)
                return 1

        if not general.get_es_connection().indices.exists_type(index=index_name, doc_type=type_name):
            # Create mapping if the queue has not been stored in Elasticsearch yet
            index_setup.add_type(index_name=index_name, type_name=type_name)

        conn_s3 = boto.connect_s3(aws_access_key_id=arg.aws_key, aws_secret_access_key=arg.aws_secret)
        bucket = conn_s3.get_bucket(arg.bucket)  # connect to S3, get bucket ptr for arg.bucket
        attach_to_s3(index_name, 
                     s3prefix=prefix, 
                     bucket=bucket, 
                     type_name=type_name, 
                     tmpcopy=arg.tmpcopy, 
                     chunk_size=arg.chunk,
                     startdate=arg.startdate,
                     enddate=arg.enddate)
    else:

        if arg.typename:
            type_name = arg.typename
        else:
            type_name = arg.queue

        if not general.get_es_connection().indices.exists_type(index=index_name, doc_type=type_name):
            # Create mapping if the queue has not been stored in Elasticsearch yet
            index_setup.add_type(index_name=index_name, type_name=type_name)

        attach_to_queue(index_name=index_name, queue_name=arg.queue, type_name=type_name)