Example No. 1
def main():
    ap = args.get_parser()
    ap.add_argument('--test', action="store_true", help="Test flag; if this argument is present, it indicates a test case")
    arg = ap.parse_args()

    assert arg.sub, 'Need a queue to subscribe to'
    assert arg.pub, 'Need a queue to publish to'

    logs.init(arg)
    queue.init(arg)
    test_flag = arg.test

    conn = boto.connect_sdb()

    with queue.open(arg.sub, 'r') as inq:
        for m in inq:
            try:
                durationProcess(conn, m, arg.pub, test_flag)
            except KeyboardInterrupt:
                log.info('GOT SIGINT, exiting!')
                break
            except EmbersException as e:
                log.exception(e.value)
            except:
                log.exception("Unexpected exception in process")
Example No. 2
def main():
    # Initialize arguments
    argparser = args.get_parser()
    argparser.add_argument('--json_file',
                           help='JSON file to publish',
                           required=True)
    arg = argparser.parse_args()

    queue.init(arg)
    writer = queue.open(arg.pub,
                        'pub',
                        ssh_key=arg.ssh_key,
                        ssh_conn=arg.tunnel)

    try:
        msg_reader = codecs.open(arg.json_file, encoding='utf-8', mode='r')
        message = msg_reader.readline()
        while message:
            writer.write(json.loads(message))
            message = msg_reader.readline()

        msg_reader.close()
    except KeyboardInterrupt:
        pass

    return 0
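The readline loop in Example No. 2 can equivalently iterate over the file object; a compact variant of the publishing step, under the same assumptions about the etool queue writer, is sketched below.

# Compact variant of Example No. 2's publishing loop: iterate the file directly
# instead of calling readline() (assumption: the writer accepts one dict per write).
import codecs
import json


def publish_json_lines(json_file, writer):
    # Each non-empty line of json_file is expected to hold one JSON document.
    with codecs.open(json_file, encoding='utf-8', mode='r') as msg_reader:
        for line in msg_reader:
            if line.strip():
                writer.write(json.loads(line))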
Example No. 3
def main():
	# Initialize arguments
	argparser = args.get_parser()
	argparser.add_argument('--local_port', help='Local port to connect to java server', required=True)
	arg = argparser.parse_args()
		
	localPort = int(arg.local_port)

	# Initialize log
	logs.init(arg)
	global log
	
	# Initialize the queue with arguments and connect to the specified feed
	log.info("Opening and connecting to queue %s", arg.sub)
	queue.init(arg)
	reader = queue.open(arg.sub, 'sub', ssh_key=arg.ssh_key, ssh_conn=arg.tunnel)
	
	# Initialize the writer to publish to a queue
	log.info("Publishing to queue %s", arg.pub)
	writer = queue.open(arg.pub, 'pub', ssh_key=arg.ssh_key, ssh_conn=arg.tunnel)
	

	count = 0
	# Connect to Java server
	while True:
		for feedmsg in reader:
			try:
				while True:
					try:
						sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
						sock.connect(("localhost", localPort))
						break
					except:
						log.info("Unable to connect to local server")

				log.debug("Connected to java server on port %d" % localPort)

				socketLines = sock.makefile()

				# Clean the message to fix irregularities
				feedmsg = message.clean(feedmsg)

				log.debug("Read message %d. Sending to java" % count)
				# Write message to socket stream
				sock.sendall(json.dumps(feedmsg))
				sock.sendall('\n')

				# Receive result from socket stream
				result = socketLines.readline()
				writer.write(json.dumps(result))
				count += 1

				sock.close()
			except KeyboardInterrupt:
				sys.exit(1)
			except socket.error:
				log.info("Server was disconnected.")
Example No. 4
def main():
    # Initialize parameters
    global TREND_RANGE
    "Initiate the TimeZone Setting"

    arg = parse_args()
    conn = boto.connect_sdb()
    operate_date = arg.operate_date
    start_date = arg.start_date
    end_date = arg.end_date

    port = arg.pub
    assert port, "Need a queue to publish to"

    logs.init(arg)
    queue.init(arg)

    t_domain = get_domain(conn, 't_enriched_bloomberg_prices')

    #trend_file = args.trend_file
    # "Load the trend changeType range file"
    trendObject = None
    trendObject = json.load(sys.stdin)

    # "Get the latest version of Trend Ranage"
    trend_versionNum = max([int(v) for v in trendObject.keys()])
    # "To avoid changing the initiate values, we first transfer the json obj to string ,then load it to create a news object"
    TREND_RANGE = json.loads(json.dumps(trendObject[str(trend_versionNum)]))

    # "If input a date range, then we will handle all the data query from those days"
    if start_date is None:
        #get raw price list
        raw_price_list = []
        rs = get_raw_data(conn, operate_date)
        for r in rs:
            raw_price_list.append(r)
        for raw_data in raw_price_list:
            process(t_domain, port, raw_data)
    else:
        t_format = "%Y-%m-%d"
        s_date = datetime.strptime(start_date, t_format)
        e_date = datetime.strptime(end_date, t_format)
        while s_date <= e_date:
            raw_price_list = []
            rs = get_raw_data(conn, datetime.strftime(s_date, t_format))
            for r in rs:
                raw_price_list.append(r)
            for raw_data in raw_price_list:
                process(t_domain, port, raw_data)
            s_date = s_date + timedelta(days=1)
            # "sleep 5 s to wait simpleDB to commit"
            time.sleep(5)

    #"Write back the trendFile"
    new_version_num = trend_versionNum + 1
    trendObject[str(new_version_num)] = TREND_RANGE
    json.dump(trendObject, sys.stdout)
Example No. 5
def test():
    queue.init()
    port = 'tcp://*:30115'
    with queue.open(port,'w',capture=True) as outq:
        msgObj = {'embersId': 'f0c030a20e28a12134d9ad0e98fd0861fae7438b', 'confidence': 0.13429584033181682, 'strength': '4', 'derivedFrom': [u'5df18f77723885a12fa6943421c819c90c6a2a02', u'be031c4dcf3eb9bba2d86870683897dfc4ec4051', u'3c6571a4d89b17ed01f1345c80cf2802a8a02b7b'], 'shiftDate': '2011-08-08', 'shiftType': 'Trend', 'location': u'Colombia', 'date': '2012-10-03', 'model': 'Finance Stock Model', 'valueSpectrum': 'changePercent', 'confidenceIsProbability': True, 'population': 'COLCAP'}
        outq.write(msgObj)
    
    print "Success"
    pathName = os.path.dirname(sys.argv[0])
    print pathName
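The test above binds tcp://*:30115 and publishes a single message. A companion sketch that subscribes to that port and prints a few fields is below; it assumes etool's queue accepts a raw ZMQ-style address in 'r' mode and connects to it, which is an inference from these examples, not documented behaviour.

# Companion reader sketch for test() (assumptions noted in the lead-in above).
from etool import queue


def read_one():
    queue.init()
    with queue.open('tcp://localhost:30115', 'r') as inq:
        for msg in inq:
            print msg['embersId'], msg['population'], msg['shiftDate']
            break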
Example No. 6
def main():
    '''
    Reads messages from the queue, retrieves the content
    from the source website and publishes the content to a new queue.
    '''
    ap = args.get_parser()
    ap.add_argument(
        '--cat',
        action="store_true",
        help='Read input from standard in and write to standard out.')
    arg = ap.parse_args()
    logs.init(arg)
    geo_mena = GeoMena()
    geo_lac = Geo(geo_region=GEO_REGION.lac)
    try:
        if arg.cat:
            log.debug('Reading from stdin and writing to stdout.')
            ins = sys.stdin
            outs = sys.stdout
            for entry in ins:
                entry = entry.decode(encoding='utf-8')
                try:
                    tweet = json.loads(entry.strip())
                    geo_annotate(tweet, geo_mena, geo_lac)
                    if tweet is not None:
                        outs.write(
                            json.dumps(tweet,
                                       ensure_ascii=False).encode("utf-8"))
                        outs.write('\n')
                        outs.flush()
                except Exception:
                    log.exception('Failed to process message "%s".', (entry, ))

        else:
            queue.init(arg)
            with queue.open(arg.sub, 'r') as inq:
                with queue.open(arg.pub, 'w', capture=True) as outq:
                    for tweet in inq:
                        try:
                            content = geo_annotate(tweet, geo_mena, geo_lac)
                            if content is not None:
                                outq.write(content)
                        except KeyboardInterrupt:
                            log.info("Got SIGINT, exiting.")
                            break
                        except Exception:
                            log.exception('Failed to process message "%s".',
                                          (tweet, ))

        return 0

    except Exception as e:
        log.exception("Unknown error in main function-{}".format(str(e)))
        return 1
Example No. 7
def main():
    '''
    Reads messages from the queue, retrieves the content
    from the source website and publishes the content to a new queue.
    '''
    ap = args.get_parser()
    ap.add_argument('--cat', action="store_true",
                    help='Read input from standard in and write to standard out.')
    ap.add_argument('--region', metavar='REGION', type=str, default=None,
                    help='Specify region to filter by')
    arg = ap.parse_args()
    logs.init(arg)
    filter_region = arg.region
    geoc = GeoCountry()
    try:
        if arg.cat:
            log.debug('Reading from stdin and writing to stdout.')
            ins = sys.stdin
            outs = codecs.getwriter('utf-8')(sys.stdout)
            for entry in ins:
                entry = entry.decode(encoding='utf-8')
                try:
                    tweet = json.loads(entry.strip())
                    tweet = annotate(tweet, geoc, filter_region)
                    if tweet is not None:
                        outs.write(json.dumps(tweet, ensure_ascii=False))
                        outs.write('\n')
                        outs.flush()
                except Exception:
                    log.exception('Failed to process message "%s".', entry)

        else:
            queue.init(arg)
            iqueue.init(arg)
            qname = "{}-geoCountry-{}".format(os.environ["CLUSTERNAME"], filter_region)
            with iqueue.open(arg.sub, 'r', qname=qname) as inq:
                with queue.open(arg.pub, 'w') as outq:  # , capture=True) as outq:
                    for tweet in inq:
                        try:
                            content = annotate(tweet, geoc, filter_region)
                            if content is not None:
                                outq.write(content)
                        except KeyboardInterrupt:
                            log.info("Got SIGINT, exiting.")
                            break
                        except Exception:
                            log.exception('Failed to process message "%s".', tweet)

        return 0

    except Exception as e:
        log.exception("Unknown error in main function-{0!s}.".format(e))
        return 1
Example No. 8
def main():
    ap = args.get_parser()
    ap.add_argument('--replay', action="store_true", help="Test flag; if this argument is present, it indicates a test case")
    # If the rule file is not given as an argument, it needs to be loaded from sys.stdin
    ap.add_argument('--rulefile', type=str, help="The rule file for duration analysis model")
    arg = ap.parse_args()

    if not arg.replay:
        assert arg.sub, 'Need a queue to subscribe to'
    assert arg.pub, 'Need a queue to publish to'

    logs.init(arg)
    queue.init(arg)
    test_flag = arg.replay
    if arg.rulefile:
        rule = eval(open(arg.rulefile).read())
    else:
        #load the rules from sys.stdin
        rule = eval(sys.stdin.read())

    conn = boto.connect_sdb()

    if not arg.replay:
        with queue.open(arg.sub, 'r') as inq:
            for m in inq:
                try:
                    replayIO = StringIO.StringIO()
                    durationProcess(rule, conn, m, arg.pub, test_flag, replayIO)
                except KeyboardInterrupt:
                    log.info('GOT SIGINT, exiting!')
                    break
                except EmbersException as e:
                    log.exception(e.value)
                except:
                    log.exception("Unexpected exception in process")
    else:
        # The replay model takes the enriched file as input
        enrich_messages = sys.stdin.readlines()
        for m in enrich_messages:
            m = json.loads(m.strip())
            try:
                replayIO = StringIO.StringIO()
                durationProcess(rule, conn, m, arg.pub, test_flag, replayIO)
            except KeyboardInterrupt:
                log.info('GOT SIGINT, exiting!')
                break
            except EmbersException as e:
                log.exception(e.value)
            except:
                log.exception("Unexpected exception in process")
Example No. 9
def main():
    '''
    Reads messages from the queue, retrieves the content
    from the source website and publishes the content to a new queue.
    '''
    ap = args.get_parser()
    ap.add_argument('--cat', action="store_true",
                    help='Read input from standard in and write to standard out.')
    arg = ap.parse_args()
    logs.init(arg)
    geo_mena = GeoMena()
    geo_lac = Geo(geo_region=GEO_REGION.lac)
    try:
        if arg.cat:
            log.debug('Reading from stdin and writing to stdout.')
            ins = sys.stdin
            outs = sys.stdout
            for entry in ins:
                entry = entry.decode(encoding='utf-8')
                try:
                    tweet = json.loads(entry.strip())
                    geo_annotate(tweet, geo_mena, geo_lac)
                    if tweet is not None:
                        outs.write(json.dumps(tweet, ensure_ascii=False).encode("utf-8"))
                        outs.write('\n')
                        outs.flush()
                except Exception:
                    log.exception('Failed to process message "%s".', (entry,))

        else:
            queue.init(arg)
            with queue.open(arg.sub, 'r') as inq:
                with queue.open(arg.pub, 'w', capture=True) as outq:
                    for tweet in inq:
                        try:
                            content = geo_annotate(tweet, geo_mena, geo_lac)
                            if content is not None:
                                outq.write(content)
                        except KeyboardInterrupt:
                            log.info("Got SIGINT, exiting.")
                            break
                        except Exception:
                            log.exception('Failed to process message "%s".', (tweet,))

        return 0

    except Exception as e:
        log.exception("Unknown error in main function-{}".format(str(e)))
        return 1
Example No. 10
def attach_to_queue(index_name, queue_name, type_name=None, limit=None):
    """
    Attaches to the queue_name provided and inserts the messages into Elasticsearch
    :param index_name:
    :param queue_name:
    :param limit:
    :return:
    """
    queue.init()
    log.debug('Attempting to attach to the queue %s' % queue_name)
    with queue.open(name=queue_name, mode='r') as message_queue:
        if limit:
            batch_messages(iterable_obj=message_queue, es_index_name=index_name, es_type=type_name, limit=limit)
        else:
            return push(iterable_obj=message_queue, es_index_name=index_name, es_type=type_name)
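A call to the helper above might look like the following; every name in it is made up for illustration rather than taken from these examples.

# Hypothetical invocation of attach_to_queue (index, queue and type names are invented).
if __name__ == '__main__':
    attach_to_queue(index_name='embers-demo-index',
                    queue_name='demo-warnings',
                    type_name='warning',
                    limit=1000)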
Example No. 11
def main():
    ap = args.get_parser()
    ap.add_argument('--f', type=str, help='the news file')

    arg = ap.parse_args()

    assert arg.f, 'Need a file to ingest'
    assert arg.pub, 'Need a queue to publish'

    logs.init(arg)
    queue.init(arg)

    with queue.open(arg.pub, 'w') as q_w, open(arg.f, 'r') as f_r:
        for line in f_r:
            news = json.loads(line)
            q_w.write(news)
Example No. 12
def main():
    ap = args.get_parser()
    ap.add_argument('--out', help="the output file of warnings")
    arg = ap.parse_args()

    assert arg.sub, 'Need a queue to subscribe to!'
    assert arg.out, 'Need a file to store warnings!'

    logs.init(arg)
    queue.init(arg)
    out_file = arg.out

    with queue.open(arg.sub, 'r') as q_r:
        for m in q_r:
            with open(out_file, "a") as out_w:
                if not check_ifexist(m):
                    out_w.write(json.dumps(m) + "\n")
                else:
                    print "Duplicated Warnings"
Example No. 13
def main():
    ap = args.get_parser()
    ap.add_argument('--out', help="the output file of warnings")
    arg = ap.parse_args()

    assert arg.sub, 'Need a queue to subscribe to!'
    assert arg.out, 'Need a file to store warnings!'

    logs.init(arg)
    queue.init(arg)
    out_file = arg.out

    with queue.open(arg.sub, 'r') as q_r:
        for m in q_r:
            with open(out_file, "a") as out_w:
                if not check_ifexist(m):
                    out_w.write(json.dumps(m) + "\n")
                else:
                    print "Duplicated Warnings"
Example No. 14
def main():
    svm_twitter = SVM_Twitter(0.1, 0.1, 'rbf')
    ap = args.get_parser()
    ap.add_argument("--pca_num", default=8, type=int)
    ap.add_argument("--net", type=str)
    ap.add_argument("--k", type=int)
    ap.add_argument("--inf", type=str, help="input folder")
    ap.add_argument("--o_surr", type=str, help="output surrogate file")
    arg = ap.parse_args()
    folder = {
        "t": "content",
        "c": "comprehend",
        "u": "user2user",
        "e": "entity"
    }

    assert arg.pub, "Please input a queue to publish surrogate"
    queue.init(arg)
    send_queue = queue.open(arg.pub, "w")
    surr_w = open(arg.o_surr, "w")
    for country in COUNTRY:
        train_file = os.path.join(
            arg.inf, "%s_train_%d" % (country.replace(" ", ""), arg.k))
        test_file = os.path.join(
            arg.inf, "%s_test_%d" % (country.replace(" ", ""), arg.k))
        svm_twitter.load_data(train_file, test_file)
        svm_twitter.normalize()
        #svm_twitter.normalize()
        #svm_twitter.pca(arg.pca_num)
        svm_twitter.fit()
        svm_twitter.predict()

        for day in svm_twitter.novel_days:
            surrogate = {"country": country, "date": day.strftime("%Y-%m-%d")}
            send_queue.write(surrogate)
            surr_w.write(json.dumps(surrogate) + "\n")

        print "prediction result: %s " % country
        print [day.strftime("%Y-%m-%d") for day in svm_twitter.novel_days]
    surr_w.flush()
    surr_w.close()
    send_queue.close()
Example No. 15
def attach_to_queue(index_name, queue_name, type_name=None, limit=None):
    """
    Attaches to the queue_name provided and inserts the messages into Elasticsearch
    :param index_name:
    :param queue_name:
    :param limit:
    :return:
    """
    queue.init()
    log.debug('Attempting to attach to the queue %s' % queue_name)
    with queue.open(name=queue_name, mode='r') as message_queue:
        if limit:
            batch_messages(iterable_obj=message_queue,
                           es_index_name=index_name,
                           es_type=type_name,
                           limit=limit)
        else:
            return push(iterable_obj=message_queue,
                        es_index_name=index_name,
                        es_type=type_name)
Example No. 16
def main():
	# Initialize arguments
	argparser = args.get_parser()
	argparser.add_argument('--json_file', help='JSON file to publish', required=True)
	arg = argparser.parse_args()
	
	queue.init(arg)
	writer = queue.open(arg.pub, 'pub', ssh_key=arg.ssh_key, ssh_conn=arg.tunnel)
	
	try:
		msg_reader = codecs.open(arg.json_file, encoding='utf-8', mode='r')
		message = msg_reader.readline()
		while message:
			writer.write(json.loads(message))
			message = msg_reader.readline()
		
		msg_reader.close()
	except KeyboardInterrupt:
		pass
	
	return 0
Example No. 17
def main():
    ap = args.get_parser()
    default_day = datetime.strftime(datetime.now(), "%Y-%m-%d")
    ap.add_argument("--d", type=str, default=default_day, help="The day to ingest, Format: dd/mm/yyyy")
    ap.add_argument("--domain", default="bloomberg_prices", help="The simpleDB table to store raw data")
    arg = ap.parse_args()

    assert arg.pub, "Need a queue to publish"
    logs.init(arg)
    queue.init(arg)

    with queue.open(arg.pub, "w") as out_q:
        for stock in STOCK_CON:
            if stock == "COLCAP":
                scrape_f = scrape_colcap_url
            if stock == "CHILE65":
                scrape_f = scrape_chile65_url
            msg = ingest_price(arg, stock, scrape_f)
            if msg is not None:
                out_q.write(msg)
                store(arg, msg)
Example No. 18
def main():
    ap = args.get_parser()
    ap.add_argument('--level', type=str, default="0.6", help='The threshold')
    ap.add_argument('--svm', action='store_true')
    ap.add_argument('--zmq', action='store_true')
    ap.add_argument('--surr', type=str, help="surrogate file")
    ap.add_argument('--warn', type=str, help="warning file")
    arg = ap.parse_args()

    logs.init(arg)
    queue.init(arg)
    assert arg.pub, "Please input a queue to publish warning"
    if arg.zmq:
        assert arg.sub, "Please input a queue to sub surrogate message"
    conn = boto.connect_sdb()
    t_domain = get_domain(conn, "s_holiday")

    if arg.zmq:
        with queue.open(arg.sub, 'r') as inq:
            for m in inq:
                try:
                    if arg.svm:
                        svm_warning(t_domain, m, arg.pub)
                    else:
                        warning_center(t_domain, m, arg.pub, float(arg.level))
                except KeyboardInterrupt:
                    log.info('GOT SIGINT, exiting!')
                    break
                except:
                    log.exception("Exception in Process:%s" %
                                  sys.exc_info()[0])
    else:
        with open(arg.warn, "w") as w, open(arg.surr) as r:
            if arg.svm:
                for m in r:
                    m = json.loads(m)
                    warning = svm_warning(t_domain, m, arg.pub)
                    w.write(json.dumps(warning) + "\n")
Example No. 19
def main():
    svm_twitter = SVM_Twitter(0.1, 0.1, 'rbf')
    ap = args.get_parser()
    ap.add_argument("--pca_num", default=8, type=int)
    ap.add_argument("--net", type=str)
    ap.add_argument("--k", type=int)
    ap.add_argument("--inf", type=str, help="input folder")
    ap.add_argument("--o_surr", type=str, help="output surrogate file")
    arg = ap.parse_args()
    folder = {"t": "content", "c": "comprehend", "u": "user2user",
              "e": "entity"}

    assert arg.pub, "Please input a queue to publish surrogate"
    queue.init(arg)
    send_queue = queue.open(arg.pub, "w")
    surr_w = open(arg.o_surr, "w")
    for country in COUNTRY:
        train_file = os.path.join(arg.inf,
                                  "%s_train_%d" % (country.replace(" ", ""), arg.k))
        test_file = os.path.join(arg.inf,
                                 "%s_test_%d" % (country.replace(" ", ""), arg.k))
        svm_twitter.load_data(train_file, test_file)
        svm_twitter.normalize()
        #svm_twitter.normalize()
        #svm_twitter.pca(arg.pca_num)
        svm_twitter.fit()
        svm_twitter.predict()

        for day in svm_twitter.novel_days:
            surrogate = {"country": country, "date": day.strftime("%Y-%m-%d")}
            send_queue.write(surrogate)
            surr_w.write(json.dumps(surrogate)+ "\n")

        print "prediction result: %s " % country
        print [day.strftime("%Y-%m-%d") for day in svm_twitter.novel_days]
    surr_w.flush()
    surr_w.close()
    send_queue.close()
Example No. 20
def main():
    ap = args.get_parser()
    ap.add_argument('--level', type=str, default="0.6",
                    help='The threshold')
    ap.add_argument('--svm', action='store_true')
    ap.add_argument('--zmq', action='store_true')
    ap.add_argument('--surr', type=str, help="surrogate file")
    ap.add_argument('--warn', type=str, help="warning file")
    arg = ap.parse_args()

    logs.init(arg)
    queue.init(arg)
    assert arg.pub, "Please input a queue to publish warning"
    if arg.zmq:
        assert arg.sub, "Please input a queue to sub surrogate message"
    conn = boto.connect_sdb()
    t_domain = get_domain(conn, "s_holiday")

    if arg.zmq:
        with queue.open(arg.sub, 'r') as inq:
            for m in inq:
                try:
                    if arg.svm:
                        svm_warning(t_domain, m, arg.pub)
                    else:
                        warning_center(t_domain, m, arg.pub, float(arg.level))
                except KeyboardInterrupt:
                    log.info('GOT SIGINT, exiting!')
                    break
                except:
                    log.exception("Exception in Process:%s" % sys.exc_info()[0])
    else:
        with open(arg.warn, "w") as w, open(arg.surr) as r:
            if arg.svm:
                for m in r:
                    m = json.loads(m)
                    warning = svm_warning(t_domain, m, arg.pub)
                    w.write(json.dumps(warning) + "\n")
Example No. 21
def main():
    ap = args.get_parser()
    ap.add_argument('--r_file', type=str, help="The rule file")
    ap.add_argument('--o', type=str, help="The output file")
    arg = ap.parse_args()

    assert arg.r_file, 'Need a rule file'
    assert arg.sub, 'Need a queue to subscribe'
    assert arg.o, 'Need a file to output'

    logs.init(arg)
    queue.init(arg)

    u_pattern = re.compile("http://(www\.){0,1}[^/]*/[a-z0-9/.\-]*(econ)[a-z0-9\.\-]*", flags=re.I)
    c_rule = create_label_rule(arg.r_file)
    g_rule = create_gold_lable(arg.r_file)
    c_pattern = re.compile(c_rule, flags=re.I)

    with queue.open(arg.sub, 'r') as q_r, codecs.open(arg.o, 'a') as f_a:
        for news in q_r:
            f_news = process(news, u_pattern, c_pattern, g_rule)
            if f_news is not None:
                f_a.write(json.dumps(f_news) + "\n")
                print f_news['date'], f_news['title'], "|", f_news['o_country'], "|", f_news["p_country"]
Example No. 22
    rs = t_domain.select(sql)
    return rs

if __name__ == "__main__":
    ap = args.get_parser()
    ap.add_argument('--s_date', type=str, help="the start date of the query")
    ap.add_argument('--e_date', type=str, help='the end date of the query')
    ap.add_argument('--f', action='store_true', help='load enriched message from file')
    ap.add_argument('--sdb', action='store_true', help='load enriched message from simpledb')
    ap.add_argument('--file', type=str, help="the file location")
    arg = ap.parse_args()

    assert arg.pub, 'Need a queue to publish'

    logs.init(arg)
    queue.init(arg)
    if arg.sdb:
        conn = boto.connect_sdb()
        t_domain = conn.get_domain('t_enriched_bloomberg_prices')
        rs = get_enriched_prices(t_domain, arg.s_date, arg.e_date)

    if arg.f:
        with open(arg.file, "r") as r:
            rs = [eval(line.strip()) for line in r.readlines()]

    with queue.open(arg.pub, 'w') as q_w, open("surrogate.txt", "w") as s_w:
        for r in rs:
            print r
            q_w.write(r)
            s_w.write(json.dumps(r) + "\n")
Example No. 23
def main():
    '''
    Reads messages from the queue, retrieves the content
    from the source website and publishes the content to a new queue.
    '''
    ap = args.get_parser()
    ap.add_argument(
        '--cat',
        action="store_true",
        help='Read input from standard in and write to standard out.')
    ap.add_argument('--region',
                    metavar='REGION',
                    type=str,
                    default=None,
                    help='Specify region to filter by')
    arg = ap.parse_args()
    logs.init(arg)
    filter_region = arg.region
    geoc = GeoCountry()
    try:
        if arg.cat:
            log.debug('Reading from stdin and writing to stdout.')
            ins = sys.stdin
            outs = codecs.getwriter('utf-8')(sys.stdout)
            for entry in ins:
                entry = entry.decode(encoding='utf-8')
                try:
                    tweet = json.loads(entry.strip())
                    tweet = annotate(tweet, geoc, filter_region)
                    if tweet is not None:
                        outs.write(json.dumps(tweet, ensure_ascii=False))
                        outs.write('\n')
                        outs.flush()
                except Exception:
                    log.exception('Failed to process message "%s".', entry)

        else:
            queue.init(arg)
            iqueue.init(arg)
            qname = "{}-geoCountry-{}".format(os.environ["CLUSTERNAME"],
                                              filter_region)
            with iqueue.open(arg.sub, 'r', qname=qname) as inq:
                with queue.open(arg.pub,
                                'w') as outq:  # , capture=True) as outq:
                    for tweet in inq:
                        try:
                            content = annotate(tweet, geoc, filter_region)
                            if content is not None:
                                outq.write(content)
                        except KeyboardInterrupt:
                            log.info("Got SIGINT, exiting.")
                            break
                        except Exception:
                            log.exception('Failed to process message "%s".',
                                          tweet)

        return 0

    except Exception as e:
        log.exception("Unknown error in main function-{0!s}.".format(e))
        return 1
Example No. 24
def main():
    global CONFIG,VOCABULARY_FILE,WARNING_PORT,SURROGATE_PORT,__version__,KEY_ID,SECRET,T_EASTERN,T_UTC
    
    "Initiate the TimeZone Setting"
    T_UTC = pytz.utc
    T_EASTERN = pytz.timezone("US/Eastern")
    
    "Get the input args"
    args = parse_args()
    rege_date = args.rege_date
    KEY_ID = args.key_id
    SECRET = args.secret
    logs.init(args)
    queue.init(args)
    # Replaced the DB connection with SimpleDB
#   conn = lite.connect(db_file)
    conn = boto.connect_sdb(KEY_ID,SECRET)
    "if rege_date is not none, it means to regenerate the past day's prediction"
    if not rege_date:
        "Normal predict"
        predict_date = args.predict_date
        model_cfg = args.model_cfg
        WARNING_PORT = args.warning_port
        SURROGATE_PORT = args.surrogate_port
        
        stock_list = None
        if args.stock_list:
            stock_list = args.stock_list
        

        "Get the Latest version of Config Object"
        f = open(model_cfg,"r")
        configObj = json.load(f)
        f.close()
        con_versionNum = max([int(v) for v in configObj.keys()])
        CONFIG = configObj[str(con_versionNum)]
        
        
        "Get the Latest version of Trend Range Object"
        clusterTrends = json.load(sys.stdin)
        trend_versionNum = max([int(v) for v in clusterTrends.keys()])
        CONFIG["trendRange"] = {"version":str(trend_versionNum),"range":clusterTrends[str(trend_versionNum)]}
        
        if not stock_list:
            stock_list = CONFIG["stocks"]
            
        "Retrain the model configuration if current day is Saturday"
        weekDay = datetime.strptime(predict_date,"%Y-%m-%d").weekday()
        if weekDay == 5:
            finalClusterProbability,finalClusterMatrix = re_training(conn,predict_date,stock_list)
            new_config = json.loads(json.dumps(CONFIG))
            new_config["clusterProbability"] = finalClusterProbability
            new_config["clusterContribution"] = finalClusterMatrix
            "Write back to configure file"
            new_version_num = con_versionNum + 1
            new_config["version"] = new_version_num
            configObj[str(new_version_num)] = new_config
            with open(model_cfg,"w") as out_q:
                out_q.write(json.dumps(configObj))
        
        "Process stock each by each"
        for stock in stock_list:
            surrogate = process_single_stock(conn,predict_date,stock)
            if surrogate:
                warning = warning_check(conn,surrogate)
        
    else:
        "regenerate the old prediction"
        model_cfg = args.model_cfg
        stock_list = None
        if args.stock_list:
            stock_list = args.stock_list
            
        "Get the version of Config Object for the indicated prediction"
        versionObj = get_predicion_version(conn,rege_date)
        configVersionNum = versionObj["configVersion"]
        trendVersionNum = versionObj["trendVersion"]
        
        configObj = json.load(open(model_cfg))
        if configVersionNum in configObj:
            CONFIG = configObj[configVersionNum]
        else:
            CONFIG = configObj["1"]
        
        "Get the Latest version of Trend Range Object"
        clusterTrends = json.load(sys.stdin)
        CONFIG["trendRange"] = {"version":str(trendVersionNum),"range":clusterTrends[trendVersionNum]}
        
        if not stock_list:
            stock_list = CONFIG["stocks"]
        
        "Process stock each by each"
        for stock in stock_list:
            surrogate = process_single_stock(conn,rege_date,stock,True)
            if surrogate:
                warning = warning_check(conn,surrogate,True)
                return warning
        
    if conn:
        conn.close()
Example No. 25
def main():
    global CONFIG, VOCABULARY_FILE, WARNING_PORT, SURROGATE_PORT, __version__, KEY_ID, SECRET, T_EASTERN, T_UTC

    "Initiate the TimeZone Setting"
    T_UTC = pytz.utc
    T_EASTERN = pytz.timezone("US/Eastern")

    "Get the input arg"
    arg = parse_args()
    rege_date = arg.rege_date
    KEY_ID = arg.aws_key
    SECRET = arg.aws_secret
    logs.init(arg)
    queue.init(arg)
    conn = boto.connect_sdb(KEY_ID, SECRET)
    # Initialize SimpleDB domains
    surrogateDomain = get_domain(conn, arg.surrogate_domain)
    warningDomain = get_domain(conn, arg.warning_domain)

    WARNING_PORT = arg.warning_port
    SURROGATE_PORT = arg.surrogate_port
    "if rege_date is not none, it means to regenerate the past day's prediction"
    if not rege_date:
        "Normal predict"
        predict_date = arg.predict_date
        model_cfg = arg.model_cfg

        stock_list = None
        if arg.stock_list:
            stock_list = arg.stock_list
        "Get the Latest version of Config Object"
        f = open(model_cfg, "r")
        configObj = json.load(f)
        f.close()
        con_versionNum = max([int(v) for v in configObj.keys()])
        CONFIG = configObj[str(con_versionNum)]
        "Get the Latest version of Trend Range Object"
        clusterTrends = json.load(sys.stdin)
        trend_versionNum = max([int(v) for v in clusterTrends.keys()])
        CONFIG["trendRange"] = {"version": str(trend_versionNum), "range": clusterTrends[str(trend_versionNum)]}

        if not stock_list:
            stock_list = CONFIG["stocks"]

        #"Retrain the model configuration if current day is Saturday"
        #weekDay = datetime.strptime(predict_date,"%Y-%m-%d").weekday()
        #if weekDay == 5:
        #    finalClusterProbability, finalClusterMatrix = re_training(surrogateDomain, predict_date, stock_list)
        #    new_config = json.loads(json.dumps(CONFIG))
        #    new_config["clusterProbability"] = finalClusterProbability
        #    new_config["clusterContribution"] = finalClusterMatrix
        #    "Write back to configure file"
        #    new_version_num = con_versionNum + 1
        #    new_config["version"] = new_version_num
        #    configObj[str(new_version_num)] = new_config
        #    with open(model_cfg, "w") as out_q:
        #        out_q.write(json.dumps(configObj))

        "Process stock each by each"
        for stock in stock_list:
            surrogate = process_single_stock(surrogateDomain, predict_date,stock)
            if surrogate:
                warning = warning_check(warningDomain, surrogate)

    else:
        "regenerate the old prediction"
        model_cfg = arg.model_cfg
        stock_list = None
        if arg.stock_list:
            stock_list = arg.stock_list

        "Get the version of Config Object for the indicated prediction"
        versionObj = get_predicion_version(warningDomain, rege_date)
        configVersionNum = versionObj["configVersion"]
        trendVersionNum = versionObj["trendVersion"]

        configObj = json.load(open(model_cfg))
        if configVersionNum in configObj:
            CONFIG = configObj[configVersionNum]
        else:
            CONFIG = configObj["1"]

        "Get the Latest version of Trend Range Object"
        clusterTrends = json.load(sys.stdin)
        #get the latest version of the warning
        tmpVersion = int(trendVersionNum)
        while tmpVersion >= 1:
            if str(tmpVersion) in clusterTrends:
                trendVersionNum = str(tmpVersion)
                break
            else:
                tmpVersion -= 1

        CONFIG["trendRange"] = {"version": str(trendVersionNum), "range": clusterTrends[trendVersionNum]}

        if not stock_list:
            stock_list = CONFIG["stocks"]

        "Process stock each by each"
        for stock in stock_list:
            replayIO = StringIO.StringIO()
            surrogate = process_single_stock(surrogateDomain, rege_date, stock, True, replayIO)
            if surrogate:
                warning = warning_check(warningDomain, surrogate, True, replayIO)
            replayInfo = replayIO.getvalue()
            weid = getwarningeid(surrogateDomain, rege_date, stock)
            with open("./demo/%s.txt" % weid, "w") as win:
                win.write(replayInfo)

    if conn:
        conn.close()
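Examples No. 24 and 25 read two versioned inputs: the model_cfg file and a trend-range object from sys.stdin, both keyed by stringified version numbers. The schematic below uses field names taken from the code above; all values are invented placeholders.

# Schematic of the versioned inputs consumed by Examples No. 24/25 (values are invented).
EXAMPLE_MODEL_CFG = {
    "1": {
        "version": 1,
        "stocks": ["COLCAP", "CHILE65"],   # symbols that also appear in Example No. 17
        "clusterProbability": {},          # refreshed by re_training()
        "clusterContribution": {},
    },
}

EXAMPLE_CLUSTER_TRENDS = {                 # shape expected from json.load(sys.stdin)
    "1": {},                               # per-version trend-range object
}

# At runtime the latest trend range is attached to the active config, e.g.
# CONFIG["trendRange"] = {"version": "1", "range": EXAMPLE_CLUSTER_TRENDS["1"]}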
Example No. 26
def main():
    """
    Utility to cache messages from a queue into Elasticsearch
    -q | --queue   : Read from <queue> and write the messages to Elasticsearch. Settings are read from embers.conf
    --log_file     : Path to write the log file to
    --log_level    : Logging level
    """
    from etool import args
    global log

    arg_parser = args.get_parser()
    arg_parser.add_argument('-q', '--queue', help='Queue name to index into Elasticsearch')
    arg_parser.add_argument('-s', '--s3fromq', action='store_true', help='ingest from S3 prefix derived from queue name')
    arg_parser.add_argument('-p', '--prefix', help='Ingest from prefix')
    #arg_parser.add_argument('-t', '--typename', default='noqueue', help='Type for prefix ingest')
    arg_parser.add_argument('-t', '--typename', help='Type for prefix ingest')
    arg_parser.add_argument('-l', '--tmpcopy', default='/home/embers/data/tmpcopy',help='Name of local copy of S3 file (same for all S3 files)')
    arg_parser.add_argument('-c', '--chunk', type=int, default=100,help='Chunk size for S3 ingest')
    arg_parser.add_argument('-i', '--clustername', help='Clustername to determine index name')
    arg_parser.add_argument('-w', '--withbase', action="store_true", help="Add basename to prefix when looking for type.")
    arg_parser.add_argument('--startdate', help='start date in format like 2015-01-02')
    arg_parser.add_argument('--enddate', help='end date in format like 2015-01-02')
    arg = arg_parser.parse_args()

    #assert (arg.queue or (arg.prefix and arg.typename)), 'Either --queue (with optional --s3fromq/--typename) or --prefix with --typename must be provided'
    assert (arg.queue or arg.prefix ), 'Either --queue (with optional --s3fromq/--typename) or --prefix  must be provided'

    log = logs.getLogger(log_name=arg.log_file)
    logs.init(arg, l=arg.log_level, logfile=arg.log_file)

    index_name = general.get_index_name(arg.clustername)

    queue.init()

    if arg.prefix or (arg.queue and arg.s3fromq):
        if arg.prefix:
            prefix = arg.prefix
            # get queue name or its substitute for S3 objects from prefix
            if arg.typename:
                type_name = arg.typename
            else:
                type_name = queue.conf.get_prefixpair(prefix=prefix,includeS3=True,withBasename=arg.withbase)
                if not type_name:
                    log.error("Could not get type from prefix %s" % prefix)
                    return 1
                log.warning("type_name=%s from prefix=%s" % (type_name, prefix))
        else:
            type_name = arg.queue
            prefix, include = queue.conf.get_prefix_for_queue(type_name, withBasename=False)
            if not prefix:
                log.error("Could not get S3 prefix for queue %s" % type_name)
                return 1

        if not general.get_es_connection().indices.exists_type(index=index_name, doc_type=type_name):
            # Create mapping if the queue has not been stored in Elasticsearch yet
            index_setup.add_type(index_name=index_name, type_name=type_name)

        conn_s3 = boto.connect_s3(aws_access_key_id=arg.aws_key, aws_secret_access_key=arg.aws_secret)
        bucket = conn_s3.get_bucket(arg.bucket)	 # connect to S3, get bucket ptr for arg.bucket
        attach_to_s3(index_name, 
                     s3prefix=prefix, 
                     bucket=bucket, 
                     type_name=type_name, 
                     tmpcopy=arg.tmpcopy, 
                     chunk_size=arg.chunk,
                     startdate=arg.startdate,
                     enddate=arg.enddate)
    else:

        if arg.typename:
            type_name=arg.typename
        else:
            type_name=arg.queue

        if not general.get_es_connection().indices.exists_type(index=index_name, doc_type=type_name):
            # Create mapping if the queue has not been stored in Elasticsearch yet
            index_setup.add_type(index_name=index_name, type_name=type_name)

        attach_to_queue(index_name=index_name, queue_name=arg.queue, type_name=type_name)
Example No. 27
def main():
    """
    Utility to cache messages from a queue into Elasticsearch
    -q | --queue   : Read from <queue> and write the messages to Elasticsearch. Settings are read from embers.conf
    --log_file     : Path to write the log file to
    --log_level    : Logging level
    """
    from etool import args
    global log

    arg_parser = args.get_parser()
    arg_parser.add_argument('-q',
                            '--queue',
                            help='Queue name to index into Elasticsearch')
    arg_parser.add_argument(
        '-s',
        '--s3fromq',
        action='store_true',
        help='ingest from S3 prefix derived from queue name')
    arg_parser.add_argument('-p', '--prefix', help='Ingest from prefix')
    #arg_parser.add_argument('-t', '--typename', default='noqueue', help='Type for prefix ingest')
    arg_parser.add_argument('-t', '--typename', help='Type for prefix ingest')
    arg_parser.add_argument(
        '-l',
        '--tmpcopy',
        default='/home/embers/data/tmpcopy',
        help='Name of local copy of S3 file (same for all S3 files)')
    arg_parser.add_argument('-c',
                            '--chunk',
                            type=int,
                            default=100,
                            help='Chunk size for S3 ingest')
    arg_parser.add_argument('-i',
                            '--clustername',
                            help='Clustername to determine index name')
    arg_parser.add_argument(
        '-w',
        '--withbase',
        action="store_true",
        help="Add basename to prefix when looking for type.")
    arg_parser.add_argument('--startdate',
                            help='start date in format like 2015-01-02')
    arg_parser.add_argument('--enddate',
                            help='end date in format like 2015-01-02')
    arg = arg_parser.parse_args()

    #assert (arg.queue or (arg.prefix and arg.typename)), 'Either --queue (with optional --s3fromq/--typename) or --prefix with --typename must be provided'
    assert (
        arg.queue or arg.prefix
    ), 'Either --queue (with optional --s3fromq/--typename) or --prefix  must be provided'

    log = logs.getLogger(log_name=arg.log_file)
    logs.init(arg, l=arg.log_level, logfile=arg.log_file)

    index_name = general.get_index_name(arg.clustername)

    queue.init()

    if arg.prefix or (arg.queue and arg.s3fromq):
        if arg.prefix:
            prefix = arg.prefix
            # get queue name or its substitute for S3 objects from prefix
            if arg.typename:
                type_name = arg.typename
            else:
                type_name = queue.conf.get_prefixpair(
                    prefix=prefix, includeS3=True, withBasename=arg.withbase)
                if not type_name:
                    log.error("Could not get type from prefix %s" % prefix)
                    return 1
                log.warning("type_name=%s from prefix=%s" %
                            (type_name, prefix))
        else:
            type_name = arg.queue
            prefix, include = queue.conf.get_prefix_for_queue(
                type_name, withBasename=False)
            if not prefix:
                log.error("Could not get S3 prefix for queue %s" % type_name)
                return 1

        if not general.get_es_connection().indices.exists_type(
                index=index_name, doc_type=type_name):
            # Create mapping if the queue has not been stored in Elasticsearch yet
            index_setup.add_type(index_name=index_name, type_name=type_name)

        conn_s3 = boto.connect_s3(aws_access_key_id=arg.aws_key,
                                  aws_secret_access_key=arg.aws_secret)
        bucket = conn_s3.get_bucket(
            arg.bucket)  # connect to S3, get bucket ptr for arg.bucket
        attach_to_s3(index_name,
                     s3prefix=prefix,
                     bucket=bucket,
                     type_name=type_name,
                     tmpcopy=arg.tmpcopy,
                     chunk_size=arg.chunk,
                     startdate=arg.startdate,
                     enddate=arg.enddate)
    else:

        if arg.typename:
            type_name = arg.typename
        else:
            type_name = arg.queue

        if not general.get_es_connection().indices.exists_type(
                index=index_name, doc_type=type_name):
            # Create mapping if the queue has not been stored in Elasticsearch yet
            index_setup.add_type(index_name=index_name, type_name=type_name)

        attach_to_queue(index_name=index_name,
                        queue_name=arg.queue,
                        type_name=type_name)