Example 1
def main():
    global log
    ap = args.get_parser()
    ap.add_argument("--s_date", type=str, help="the start date to ingest: format mmddyyyy")
    ap.add_argument("--e_date", type=str, help="the end of date to ingest: format mmddyyyy")
    ap.add_argument("--o", type=str, help="the output directory")
    ap.add_argument("--region", type=str, help="the region of the web site")
    arg = ap.parse_args()
    logs.init(arg)

    t_format = "%m%d%Y"
    s_date = datetime.strptime(arg.s_date, t_format)
    e_date = datetime.strptime(arg.e_date, t_format)
    d_delta = (e_date - s_date).days

    seen_it = shelve.open("%s_reuters_news_seen_it.db" % (arg.region))
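    # seen_it is a persistent shelve DB, presumably used by get_daily_news to skip articles already ingested in earlier runs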
    i = 0
    while i <= d_delta:
        day_str = datetime.strftime(s_date + timedelta(days=i), t_format)
        print "Extracting %s" % (day_str)
        "write news to day file"
        with open("%s%s%s_ita_reuters_%s.txt" % (arg.o, os.sep, day_str, arg.region), "w") as w:
            daily_news = get_daily_news(day_str, seen_it, arg.region)
            for news in daily_news:
                w.write(json.dumps(news) + "\n")
        i += 1
Example 2
def main():
    ap = args.get_parser()
    ap.add_argument('--rev_file', metavar='FILE', type=str, required=False,
                     help='File which stores last revision id.', default="last_rev_file.txt")
    arg = ap.parse_args()
    logs.init(arg)
    log.info("Run Started")

    api = API()
    recent_changes = api.get_recent_changes()

    # Filter to only revisions newer than last run
    last_rev_file = arg.rev_file
    if os.path.exists(last_rev_file):
        with open(last_rev_file) as f:
            last_rev = int(f.read())
            recent_changes = [change for change in recent_changes if change['revid'] > last_rev]

    recently_changed_page_ids = set(str(change['pageid']) for change in recent_changes)
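    # Fetch the newest revision of every page that changed since the last recorded revision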
    latest_revisions = api.get_latest_revision(recently_changed_page_ids)
    print(len(latest_revisions))
    print(latest_revisions)
    # TODO actually do something with revisions
    with open(last_rev_file, mode='w') as f:
        f.write(str(max([int(page['revisions'][0]['revid']) for page in latest_revisions.values()])))
Example 3
def main():
    """
    Utility to cache messages from all queues on the --hostname provided that have 'cache: true' set in embers.conf
    --hostname  : Cache all active queues on this host
    --log_file  : Path to write the log file to
    --log_level : Logging level
    """
    from etool import args
    global log

    arg_parser = args.get_parser()
    arg_parser.add_argument('--hostname', metavar='HOSTNAME', type=str, default=environ.get('HOSTNAME', None),
                            help="The hostname of the machine whose services' data you wish to cache")
    arg = arg_parser.parse_args()

    log = logs.getLogger(log_name=arg.log_file)
    logs.init(arg, l=arg.log_level, logfile=arg.log_file)
    conf.init(arg)

    assert arg.hostname, '--hostname must be provided'
    queues = conf.get_all_cached_queues(hostname=arg.hostname)
    pool = []
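    # Spawn one cache_queue process per cached queue and keep the handles so they can be joined below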

    for queue in queues:
        log.info('Spawning cache process for %s' % queue)
        p = multiprocessing.Process(name=queue, target=cache_queue, args=(queue,))
        p.start()
        pool.append(p)

    try:
        for process in pool:
            process.join()
            log.warn('%s caching has stopped' % process.name)
    except KeyboardInterrupt:
        log.warn('Keyboard interrupt in main')
Example 4
def main():
    ap = args.get_parser()
    ap.add_argument('--test', action="store_true", help="Test flag; if present, this run is treated as a test case")
    arg = ap.parse_args()

    assert arg.sub, 'Need a queue to subscribe to'
    assert arg.pub, 'Need a queue to publish to'

    logs.init(arg)
    queue.init(arg)
    test_flag = arg.test

    conn = boto.connect_sdb()
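    # SimpleDB connection passed to durationProcess for each message read from the queue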

    with queue.open(arg.sub, 'r') as inq:
        for m in inq:
            try:
                durationProcess(conn, m, arg.pub, test_flag)
            except KeyboardInterrupt:
                log.info('GOT SIGINT, exiting!')
                break
            except EmbersException as e:
                log.exception(e.value)
            except:
                log.exception("Unexpected exception in process")
Example 5
def init():
    global con
    global cur
    
    con = common.getDBConnection()
    cur = con.cursor()
    logs.init()
Example 6
def main():
    ap = args.get_parser()
    ap.add_argument('-i', '--inputFolder', type=str,
                    help='inputFolder containing twitter files',
                    default='/hdd/tweets/2012/may')
    ap.add_argument('-s', '--scoresFolder', type=str,
                    help='Folder containing scoreCards',
                    default='../data/scores/MX/')
    ap.add_argument('-cf', '--configFile', type=str,
                    help='election configuration file',
                    default='../configFiles/electionConfig_MX')
    ap.add_argument('-d1', '--fromDate', type=str,
                    help='fromDate')
    ap.add_argument('-d2', '--toDate', type=str,
                    help='toDate')
    ap.add_argument('-f1', '--flag1', help="countOrPredict",
                    type=str, default='2')
    ap.add_argument('-r', '--regression', help="regressionType",
                    type=str, default='LASSO')
    ap.add_argument('-f2', '--flag2', help="flag to push surrogates and warning to S3",
                    type=str, default='0')
    arg = ap.parse_args()
    logs.init(arg)

    try:
        elections = Elections(arg.inputFolder, arg.scoresFolder,
                              arg.configFile, arg.fromDate, arg.toDate)
        log.info("Election class initialized")
    except Exception as e:
        log.exception("exception during intialization: %s. Quitting!!", e)

    try:
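        # flag1: '1' or '3' collects tweet mentions; '2' or '3' (next block) runs the regression to pick a winner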
        if (arg.flag1 == '1' or arg.flag1 == '3'):
            elections.collectMentions()
    except Exception as e:
        log.exception("error while tracking tweets")

    try:
        if (arg.flag1 == '2' or arg.flag1 == '3'):
            winner, winningScore, runnerUp, runnerUpScore, finalScore = elections.getWinner(arg.fromDate, arg.toDate, arg.regression)
            print "------------Regression Results-----------"
            print finalScore
            print winner + "====>" + str(winningScore)
            print "-----------------------------------------"
    except Exception as e:
        log.exception("error while calculating winner:%s", e)

    try:
        elections.createSurrogate(winner, winningScore, runnerUp, runnerUpScore, arg.flag2)
    except Exception as e:
        log.exception("error during creating warnings")

    try:
        if (arg.flag2 == '1'):
            elections.storeStatistics(arg.fromDate, arg.toDate)
    except Exception as e:
        log.exception("error in storing statistics:%s", e)

    log.info("ALL Operations Complete")
Example 7
def main():
	# Initialize arguments
	argparser = args.get_parser()
	argparser.add_argument('--local_port', help='Local port to connect to java server', required=True)
	arg = argparser.parse_args()
		
	localPort = int(arg.local_port)

	# Initialize log
	logs.init(arg)
	global log
	
	# Initialize the queue with arguments and connect to the specified feed
	log.info("Opening and connecting to queue %s", arg.sub)
	queue.init(arg)
	reader = queue.open(arg.sub, 'sub', ssh_key=arg.ssh_key, ssh_conn=arg.tunnel)
	
	# Initialize the writer to publish to a queue
	log.info("Publishing to queue %s", arg.pub)
	writer = queue.open(arg.pub, 'pub', ssh_key=arg.ssh_key, ssh_conn=arg.tunnel)
	

	count = 0
	# Connect to Java server
	while True:
		for feedmsg in reader:
			try:
				while True:
					try:
						sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
						sock.connect(("localhost", localPort))
						break
					except:
						log.info("Unable to connect to local server")

				log.debug("Connected to java server on port %d" % localPort)

				socketLines = sock.makefile()

				# Clean the message to fix irregularities
				feedmsg = message.clean(feedmsg)

				log.debug("Read message %d. Sending to java" % count)
				# Write message to socket stream
				sock.sendall(json.dumps(feedmsg))
				sock.sendall('\n')

				# Receive result from socket stream
				result = socketLines.readline()
				writer.write(json.dumps(result))
				count += 1

				sock.close()
			except KeyboardInterrupt:
				sys.exit(1)
			else:
				log.info("Server was disconnected.")
Example 8
def main():
    #initiate parameters
    global TREND_RANGE
    "Initiate the TimeZone Setting"

    arg = parse_args()
    conn = boto.connect_sdb()
    operate_date = arg.operate_date
    start_date = arg.start_date
    end_date = arg.end_date

    port = arg.pub
    assert port, "Need a queue to publish to"

    logs.init(arg)
    queue.init(arg)

    t_domain = get_domain(conn, 't_enriched_bloomberg_prices')

    #trend_file = args.trend_file
    # "Load the trend changeType range file"
    trendObject = None
    trendObject = json.load(sys.stdin)

    # "Get the latest version of Trend Ranage"
    trend_versionNum = max([int(v) for v in trendObject.keys()])
    # "To avoid changing the initiate values, we first transfer the json obj to string ,then load it to create a news object"
    TREND_RANGE = json.loads(json.dumps(trendObject[str(trend_versionNum)]))

    # "If input a date range, then we will handle all the data query from those days"
    if start_date is None:
        #get raw price list
        raw_price_list = []
        rs = get_raw_data(conn, operate_date)
        for r in rs:
            raw_price_list.append(r)
        for raw_data in raw_price_list:
            process(t_domain, port, raw_data)
    else:
        t_format = "%Y-%m-%d"
        s_date = datetime.strptime(start_date, t_format)
        e_date = datetime.strptime(end_date, t_format)
        while s_date <= e_date:
            raw_price_list = []
            rs = get_raw_data(conn, datetime.strftime(s_date, t_format))
            for r in rs:
                raw_price_list.append(r)
            for raw_data in raw_price_list:
                process(t_domain, port, raw_data)
            s_date = s_date + timedelta(days=1)
            # "sleep 5 s to wait simpleDB to commit"
            time.sleep(5)

    #"Write back the trendFile"
    new_version_num = trend_versionNum + 1
    trendObject[str(new_version_num)] = TREND_RANGE
    json.dump(trendObject, sys.stdout)
Example 9
def main():
    logs.init(l=logs.DEBUG)
    # TODO translate non-English characters
    # print(get_tweets(keywords='esta'))
    # print(aggregate_tweets_over_time())
    # print(aggregate_tweets_by_country())
    # print(aggregate_tweets_by_country(lemmas='bill'))
    # print(aggregate_tweets_by_country(country='brazil', lemmas='bill'))
    # print(aggregate_tweets_by_country(country='brazil', lemmas='bill', entities=''))

    print(get_abbreviated_tweets_for_dqe('brazil', 'datasift-keyword', start_date='2015-03-27', end_date='2015-03-27'))
Example 10
def main():
    '''
    Reads tweets from the queue, annotates them with geographic
    information and publishes the annotated tweets to a new queue.
    '''
    ap = args.get_parser()
    ap.add_argument(
        '--cat',
        action="store_true",
        help='Read input from standard in and write to standard out.')
    arg = ap.parse_args()
    logs.init(arg)
    geo_mena = GeoMena()
    geo_lac = Geo(geo_region=GEO_REGION.lac)
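    # Two geo annotators: GeoMena for the MENA region and a Geo instance configured for the LAC region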
    try:
        if arg.cat:
            log.debug('Reading from stdin and writing to stdout.')
            ins = sys.stdin
            outs = sys.stdout
            for entry in ins:
                entry = entry.decode(encoding='utf-8')
                try:
                    tweet = json.loads(entry.strip())
                    geo_annotate(tweet, geo_mena, geo_lac)
                    if tweet is not None:
                        outs.write(
                            json.dumps(tweet,
                                       ensure_ascii=False).encode("utf-8"))
                        outs.write('\n')
                        outs.flush()
                except Exception:
                    log.exception('Failed to process message "%s".', (entry, ))

        else:
            queue.init(arg)
            with queue.open(arg.sub, 'r') as inq:
                with queue.open(arg.pub, 'w', capture=True) as outq:
                    for tweet in inq:
                        try:
                            content = geo_annotate(tweet, geo_mena, geo_lac)
                            if content is not None:
                                outq.write(content)
                        except KeyboardInterrupt:
                            log.info("Got SIGINT, exiting.")
                            break
                        except Exception:
                            log.exception('Failed to process message "%s".',
                                          (tweet, ))

        return 0

    except Exception as e:
        log.exception("Unknown error in main function-{}".format(str(e)))
        return 1
Example 11
def main():
    '''
    Reads tweets from the queue, annotates them with country-level geographic
    information (optionally filtered by region) and publishes them to a new queue.
    '''
    ap = args.get_parser()
    ap.add_argument('--cat', action="store_true",
                    help='Read input from standard in and write to standard out.')
    ap.add_argument('--region', metavar='REGION', type=str, default=None,
                    help='Specify region to filter by')
    arg = ap.parse_args()
    logs.init(arg)
    filter_region = arg.region
    geoc = GeoCountry()
    try:
        if arg.cat:
            log.debug('Reading from stdin and writing to stdout.')
            ins = sys.stdin
            outs = codecs.getwriter('utf-8')(sys.stdout)
            for entry in ins:
                entry = entry.decode(encoding='utf-8')
                try:
                    tweet = json.loads(entry.strip())
                    tweet = annotate(tweet, geoc, filter_region)
                    if tweet is not None:
                        outs.write(json.dumps(tweet, ensure_ascii=False))
                        outs.write('\n')
                        outs.flush()
                except Exception:
                    log.exception('Failed to process message "%s".', entry)

        else:
            queue.init(arg)
            iqueue.init(arg)
            qname = "{}-geoCountry-{}".format(os.environ["CLUSTERNAME"], filter_region)
            with iqueue.open(arg.sub, 'r', qname=qname) as inq:
                with queue.open(arg.pub, 'w') as outq:  # , capture=True) as outq:
                    for tweet in inq:
                        try:
                            content = annotate(tweet, geoc, filter_region)
                            if content is not None:
                                outq.write(content)
                        except KeyboardInterrupt:
                            log.info("Got SIGINT, exiting.")
                            break
                        except Exception:
                            log.exception('Failed to process message "%s".', tweet)

        return 0

    except Exception as e:
        log.exception("Unknown error in main function-{0!s}.".format(e))
        return 1
Example 12
def main():
    ap = args.get_parser()
    ap.add_argument('--replay', action="store_true", help="Replay/test flag; if present, this run is treated as a test case")
    # if the rule file is not given as an argument, it is read from sys.stdin
    ap.add_argument('--rulefile', type=str, help="The rule file for duration analysis model")
    arg = ap.parse_args()

    if not arg.replay:
        assert arg.sub, 'Need a queue to subscribe to'
    assert arg.pub, 'Need a queue to publish to'

    logs.init(arg)
    queue.init(arg)
    test_flag = arg.replay
    if arg.rulefile:
        rule = eval(open(arg.rulefile).read())
    else:
        #load the rules from sys.stdin
        rule = eval(sys.stdin.read())
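    # Note: the rules are Python literals evaluated with eval(), so the rule source must be trusted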

    conn = boto.connect_sdb()

    if not arg.replay:
        with queue.open(arg.sub, 'r') as inq:
            for m in inq:
                try:
                    replayIO = StringIO.StringIO()
                    durationProcess(rule, conn, m, arg.pub, test_flag, replayIO)
                except KeyboardInterrupt:
                    log.info('GOT SIGINT, exiting!')
                    break
                except EmbersException as e:
                    log.exception(e.value)
                except:
                    log.exception("Unexpected exception in process")
    else:
        # replay mode takes an enriched file as input
        enrich_messages = sys.stdin.readlines()
        for m in enrich_messages:
            m = json.loads(m.strip())
            try:
                replayIO = StringIO.StringIO()
                durationProcess(rule, conn, m, arg.pub, test_flag, replayIO)
            except KeyboardInterrupt:
                log.info('GOT SIGINT, exiting!')
                break
            except EmbersException as e:
                log.exception(e.value)
            except:
                log.exception("Unexpected exception in process")
Example 13
def main():
    ap = args.get_parser()
    ap.add_argument('--o', type=str, help="the output dir to store news")
    arg = ap.parse_args()
    
    assert arg.o, 'Need a dir to store news'
    logs.init(arg)
    locale.setlocale(locale.LC_TIME, 'es_ES.utf-8')
    
    seen_it = shelve.open('elfinance_seen_it.db')
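    # Persistent record of articles already fetched, presumably so get_category_news can skip duplicates across runs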
    
    cas = ['finanzas']
    for ca in cas:
        get_category_news(ca, seen_it, arg.o)
Example 14
def main():
    '''
    Reads tweets from the queue, annotates them with geographic
    information and publishes the annotated tweets to a new queue.
    '''
    ap = args.get_parser()
    ap.add_argument('--cat', action="store_true",
                    help='Read input from standard in and write to standard out.')
    arg = ap.parse_args()
    logs.init(arg)
    geo_mena = GeoMena()
    geo_lac = Geo(geo_region=GEO_REGION.lac)
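    # Two geo annotators: GeoMena for the MENA region and a Geo instance configured for the LAC region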
    try:
        if arg.cat:
            log.debug('Reading from stdin and writing to stdout.')
            ins = sys.stdin
            outs = sys.stdout
            for entry in ins:
                entry = entry.decode(encoding='utf-8')
                try:
                    tweet = json.loads(entry.strip())
                    geo_annotate(tweet, geo_mena, geo_lac)
                    if tweet is not None:
                        outs.write(json.dumps(tweet, ensure_ascii=False).encode("utf-8"))
                        outs.write('\n')
                        outs.flush()
                except Exception:
                    log.exception('Failed to process message "%s".', (entry,))

        else:
            queue.init(arg)
            with queue.open(arg.sub, 'r') as inq:
                with queue.open(arg.pub, 'w', capture=True) as outq:
                    for tweet in inq:
                        try:
                            content = geo_annotate(tweet, geo_mena, geo_lac)
                            if content is not None:
                                outq.write(content)
                        except KeyboardInterrupt:
                            log.info("Got SIGINT, exiting.")
                            break
                        except Exception:
                            log.exception('Failed to process message "%s".', (tweet,))

        return 0

    except Exception as e:
        log.exception("Unknown error in main function-{}".format(str(e)))
        return 1
Example 15
def main():
    logs.init(l=logs.DEBUG)
    # TODO translate non-English characters
    # print(get_tweets(keywords='esta'))
    # print(aggregate_tweets_over_time())
    # print(aggregate_tweets_by_country())
    # print(aggregate_tweets_by_country(lemmas='bill'))
    # print(aggregate_tweets_by_country(country='brazil', lemmas='bill'))
    # print(aggregate_tweets_by_country(country='brazil', lemmas='bill', entities=''))

    print(
        get_abbreviated_tweets_for_dqe('brazil',
                                       'datasift-keyword',
                                       start_date='2015-03-27',
                                       end_date='2015-03-27'))
Example 16
def main():
    ap = args.get_parser()
    ap.add_argument('--f', type=str, help='the news file')

    arg = ap.parse_args()

    assert arg.f, 'Need a file to ingest'
    assert arg.pub, 'Need a queue to publish'

    logs.init(arg)
    queue.init(arg)

    with queue.open(arg.pub, 'w') as q_w, open(arg.f, 'r') as f_r:
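        # Each line of the input file is one JSON news record; publish it as a message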
        for line in f_r:
            news = json.loads(line)
            q_w.write(news)
Example 17
def main():
    """
    Utility to query warnings stored in Elasticsearch
    --log_file     : Path to write the log file to
    --log_level    : Logging level
    """
    from etool import args
    global log

    arg_parser = args.get_parser()
    arg = arg_parser.parse_args()

    log = logs.getLogger(log_name=arg.log_file)
    logs.init(arg, l=arg.log_level, logfile=arg.log_file)

    print(query(max_results=30))
Example 18
def initiate():
    global newsAlreadyDownload
    global companyListDir
    global port
    global newsAlreadDownloadFilePath
    global companyList
    global dailyNewsOutPath
    
    args = parse_args()
    logs.init()
    
    newsAlreadDownloadFilePath = args.f_downloaded 
    companyListDir = args.f_company_list
    port = args.port
    dailyNewsOutPath = args.f_out
    
    newsAlreadyDownload = json.load(open(newsAlreadDownloadFilePath))
    companyList = json.load(open(companyListDir))
Example 19
def main():
    ap = args.get_parser()
    ap.add_argument('--out', help="the output file of warnings")
    arg = ap.parse_args()

    assert arg.sub, 'Need a queue to subscribe to!'
    assert arg.out, 'Need a file to store warnings!'

    logs.init(arg)
    queue.init(arg)
    out_file = arg.out

    with queue.open(arg.sub, 'r') as q_r:
        for m in q_r:
            with open(out_file, "a") as out_w:
                if not check_ifexist(m):
                    out_w.write(json.dumps(m) + "\n")
                else:
                    print "Duplicated Warnings"
Example 20
def main():
    ap = args.get_parser()
    ap.add_argument('--out', help="the output file of warnings")
    arg = ap.parse_args()

    assert arg.sub, 'Need a queue to subscribe to!'
    assert arg.out, 'Need a file to store warnings!'

    logs.init(arg)
    queue.init(arg)
    out_file =  arg.out

    with queue.open(arg.sub, 'r') as q_r:
        for m in q_r:
            with open(out_file, "a") as out_w:
                if not check_ifexist(m):
                    out_w.write(json.dumps(m) + "\n")
                else:
                    print "Duplicated Warnings"
Example 21
def main():
    ap = args.get_parser()
    ap.add_argument('-i', '--input', default='sys.stdin', type=str, help='Path to the input file. '
                    'Default is sys.stdin')
    ap.add_argument('-o', '--out', default='sys.stdout', type=str, help='Path to the output file. '
                    'Default is sys.stdout')
    ap.add_argument('searchPhrase', default='config/phrases.txt', type=str, help='Path to '
                    'the phrase file if the "-f" flag is specified; otherwise the input string itself is '
                    'treated as the phrase.')
    ap.add_argument('-f', '--file', action='store_true', default=False, help='If given, the '
                    'searchPhrase argument is interpreted as a path to a file')
    global logger
    logger = logs.getLogger("%s-%s.log" % (__processor__, str(datetime.now())))
    arg = ap.parse_args()
    logs.init(arg)
    inputFile = None
    outFile = None
    phraseFile = None

    if arg.input == 'sys.stdin':
        reader = codecs.getreader('utf-8')(sys.stdin)
    else:
        inputFile = open(arg.input, "r")
        reader = codecs.getreader('utf-8')(inputFile)
    if arg.out == 'sys.stdout':
        writer = codecs.getwriter('utf-8')(sys.stdout)
    else:
        outFile = codecs.open(arg.out, "w", encoding="utf-8")
        writer = codecs.getwriter('utf-8')(outFile)
    if arg.file:
        phraseFile = codecs.open(arg.searchPhrase, encoding='utf-8')
        generatePhraseList(phraseFile.readlines())
    else:
        generatePhraseList([arg.searchPhrase])
    phraseSearch(reader, writer)
    #close all files
    if inputFile:
        inputFile.close()
    if outFile:
        outFile.close()
    if phraseFile:
        phraseFile.close()
Example 22
def main():
    """
    Utility to cache messages from all queues on the --hostname provided that have 'cache: true' set in embers.conf
    --hostname  : Cache all active queues on this host
    --log_file  : Path to write the log file to
    --log_level : Logging level
    """
    from etool import args
    global log

    arg_parser = args.get_parser()
    arg_parser.add_argument(
        '--hostname',
        metavar='HOSTNAME',
        type=str,
        default=environ.get('HOSTNAME', None),
        help=
        "The hostname of the machine whose services' data you wish to cache")
    arg = arg_parser.parse_args()

    log = logs.getLogger(log_name=arg.log_file)
    logs.init(arg, l=arg.log_level, logfile=arg.log_file)
    conf.init(arg)

    assert arg.hostname, '--hostname must be provided'
    queues = conf.get_all_cached_queues(hostname=arg.hostname)
    pool = []

    for queue in queues:
        log.info('Spawning cache process for %s' % queue)
        p = multiprocessing.Process(name=queue,
                                    target=cache_queue,
                                    args=(queue, ))
        p.start()
        pool.append(p)

    try:
        for process in pool:
            process.join()
            log.warn('%s caching has stopped' % process.name)
    except KeyboardInterrupt:
        log.warn('Keyboard interrupt in main')
Example 23
def main():
    """
    Utility to set up a mapping for an EMBERS queue in Elasticsearch
    -q | --queue     : Queue name to set up the mapping for. Settings are read from embers.conf
    --log_file  : Path to write the log file to
    --log_level : Logging level
    """
    from etool import args
    global log

    arg_parser = args.get_parser()
    arg_parser.add_argument('-q', '--queue', help='Queue name to map into Elasticsearch')
    arg = arg_parser.parse_args()

    assert arg.queue, '--queue must be provided'

    log = logs.getLogger(log_name=arg.log_file)
    logs.init(arg, l=arg.log_level, logfile=arg.log_file)

    add_type(index_name=general.get_index_name(), type_name=arg.queue)
Example 24
def main():
    ap = args.get_parser()
    default_day = datetime.strftime(datetime.now(), "%Y-%m-%d")
    ap.add_argument("--d", type=str, default=default_day, help="The day to ingest, Format: dd/mm/yyyy")
    ap.add_argument("--domain", default="bloomberg_prices", help="The simpleDB table to store raw data")
    arg = ap.parse_args()

    assert arg.pub, "Need a queue to publish"
    logs.init(arg)
    queue.init(arg)

    with queue.open(arg.pub, "w") as out_q:
        for stock in STOCK_CON:
            if stock == "COLCAP":
                scrape_f = scrape_colcap_url
            if stock == "CHILE65":
                scrape_f = scrape_chile65_url
            msg = ingest_price(arg, stock, scrape_f)
            if msg is not None:
                out_q.write(msg)
                store(arg, msg)
Example 25
def main():
    ap = args.get_parser()
    ap.add_argument('-c', '--conf', metavar='CONF', type=str, nargs='?', 
                    default=os.path.join(os.path.dirname(__file__), 'bloomberg_news_ingest.conf'),
                    help='The location of the configuration file.')
    arg = ap.parse_args()
    assert arg.pub, "--pub required. Need a queue to publish on"

    logs.init(arg)
    conf = get_conf(arg.conf)
    seen_it = shelve.open("bloomberg_news_seen_it.db")
    
    try:
        with queue.open(arg.pub, 'w', capture=True) as outq:
            for (index, companies) in conf.items():
                for company in companies:
                    articles = get_stock_news(index, company, seen_it)
                    for a in articles:
                        outq.write(a)

    except KeyboardInterrupt:
        log.info('GOT SIGINT, exiting')
Example 26
def main():
    ap = args.get_parser()
    ap.add_argument('--level', type=str, default="0.6", help='The threshold')
    ap.add_argument('--svm', action='store_true')
    ap.add_argument('--zmq', action='store_true')
    ap.add_argument('--surr', type=str, help="surrogate file")
    ap.add_argument('--warn', type=str, help="warning file")
    arg = ap.parse_args()

    logs.init(arg)
    queue.init(arg)
    assert arg.pub, "Please input a queue to publish warning"
    if arg.zmq:
        assert arg.sub, "Please input a queue to sub surrogate message"
    conn = boto.connect_sdb()
    t_domain = get_domain(conn, "s_holiday")

    if arg.zmq:
        with queue.open(arg.sub, 'r') as inq:
            for m in inq:
                try:
                    if arg.svm:
                        svm_warning(t_domain, m, arg.pub)
                    else:
                        warning_center(t_domain, m, arg.pub, float(arg.level))
                except KeyboardInterrupt:
                    log.info('GOT SIGINT, exiting!')
                    break
                except:
                    log.exception("Exception in Process:%s" %
                                  sys.exc_info()[0])
    else:
        with open(arg.warn, "w") as w, open(arg.surr) as r:
            if arg.svm:
                for m in r:
                    m = json.loads(m)
                    warning = svm_warning(t_domain, m, arg.pub)
                    w.write(json.dumps(warning) + "\n")
Example 27
def main():
    ap = args.get_parser()
    ap.add_argument('--level', type=str, default="0.6",
                    help='The threshold')
    ap.add_argument('--svm', action='store_true')
    ap.add_argument('--zmq', action='store_true')
    ap.add_argument('--surr', type=str, help="surrogate file")
    ap.add_argument('--warn', type=str, help="warning file")
    arg = ap.parse_args()

    logs.init(arg)
    queue.init(arg)
    assert arg.pub, "Please input a queue to publish warning"
    if arg.zmq:
        assert arg.sub, "Please input a queue to sub surrogate message"
    conn = boto.connect_sdb()
    t_domain = get_domain(conn, "s_holiday")

    if arg.zmq:
        with queue.open(arg.sub, 'r') as inq:
            for m in inq:
                try:
                    if arg.svm:
                        svm_warning(t_domain, m, arg.pub)
                    else:
                        warning_center(t_domain, m, arg.pub, float(arg.level))
                except KeyboardInterrupt:
                    log.info('GOT SIGINT, exiting!')
                    break
                except:
                    log.exception("Exception in Process:%s" % sys.exc_info()[0])
    else:
        with open(arg.warn, "w") as w, open(arg.surr) as r:
            if arg.svm:
                for m in r:
                    m = json.loads(m)
                    warning = svm_warning(t_domain, m, arg.pub)
                    w.write(json.dumps(warning) + "\n")
Example 28
def main():
    ap = args.get_parser()
    ap.add_argument('--r_file', type=str, help="The rule file")
    ap.add_argument('--o', type=str, help="The output file")
    arg = ap.parse_args()

    assert arg.r_file, 'Need a rule file'
    assert arg.sub, 'Need a queue to subscribe'
    assert arg.o, 'Need a file to output'

    logs.init(arg)
    queue.init(arg)

    u_pattern = re.compile("http://(www\.){0,1}[^/]*/[a-z0-9/.\-]*(econ)[a-z0-9\.\-]*", flags=re.I)
    c_rule = create_label_rule(arg.r_file)
    g_rule = create_gold_lable(arg.r_file)
    c_pattern = re.compile(c_rule, flags=re.I)

    with queue.open(arg.sub, 'r') as q_r, codecs.open(arg.o, 'a') as f_a:
        for news in q_r:
            f_news = process(news, u_pattern, c_pattern, g_rule)
            if f_news is not None:
                f_a.write(json.dumps(f_news) + "\n")
                print f_news['date'], f_news['title'], "|", f_news['o_country'], "|", f_news["p_country"]
Example 29
def main():
    logs.init(l=logs.DEBUG)
    print(get_data(sources=['Warnings']))
Example 30
 def __init__(self,cfgPath):
     common.init(cfgPath)
     logs.init()
Example 31
def execute(arg):
    logs.init(arg)

    fromDate = datetime.strptime(arg.fromDate, "%d %b %Y")
    toDate = datetime.strptime(arg.toDate, "%d %b %Y")
    tweetFolder = arg.tweetFolder
    country = arg.country

    hashTagCounts = {}
    uids = {}

    # loading twitter handles from a file
    with open(arg.seedFile, 'r') as _file:
        for line in _file:
            handle, candidate = line.strip().split(',')
            if candidate not in uids:
                uids[candidate] = []
                hashTagCounts[candidate] = {}
                uids[candidate].append(handle.lower())
            else:
                uids[candidate].append(handle.lower())

    # for geolocation
    geo = Geo()
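    # Geo() normalizes each tweet's location to (city, country, state) so tweets can be filtered by country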

    for _file in sorted(os.listdir(tweetFolder)):
        fileDate = datetime.strptime(_file[17:27], '%Y-%m-%d')
        if (fileDate >= fromDate and fileDate < toDate):
            log.info("processing file %s" % (_file))
            try:
                with open(tweetFolder + "/" + _file, "r") as FILE:
                    for line in FILE:
                        try:
                            jsonTweet = json.loads(line.strip())
                            dateStr = jsonTweet['interaction']['created_at'][5:16]
                            tweetDate = datetime.strptime(dateStr, '%d %b %Y')
                            geoList = geo.geo_normalize(jsonTweet)
                            city, ctry, state = geoList[:3]
                            if ctry and (ctry.lower() == country) and (tweetDate >= fromDate) and (tweetDate <= toDate):
                                userId, realName = None, None
                                if 'twitter' in jsonTweet:
                                    if 'user' in jsonTweet['twitter']:
                                        if 'screen_name' in jsonTweet['twitter']['user']:
                                            userId = jsonTweet['twitter']['user']['screen_name'].lower()
                                        if 'name' in jsonTweet['twitter']['user']:
                                            realName = jsonTweet['twitter']['user']['name'].lower()
                                if userId is None and realName is None:
                                    continue
                                log.debug('userId or realName is not None')
                                candidate = getCandidate(userId, realName, uids)
                                if candidate is not None:
                                    log.debug('found candidate--> ' + candidate)
                                    # preprocess the tweet
                                    text = jsonTweet["interaction"]["content"]
                                    text = re.sub(URL_REGEX, ' ', text)  # remove urls
                                    text = re.sub('[^A-Za-z_@#0-9]', ' ', normalize_str(text, lower=True))  # allow only alphaNumerics and twitter tags
                                    text = re.sub(' +', ' ', text)  # remove multiple spaces
                                    hashTags = extract_hash_tags(text)
                                    hashTags = [hashTag for hashTag in hashTags if len(hashTag) > 3]
                                    for hashTag in hashTags:
                                        if hashTag.startswith('#'):
                                            hashTag = hashTag[1:]
                                        if hashTag in hashTagCounts[candidate]:
                                            hashTagCounts[candidate][hashTag] += 1
                                        else:
                                            hashTagCounts[candidate][hashTag] = 1
                        except Exception, e:
                            log.exception('error processing tweet %s' % e)
            except Exception, f:
                log.exception('error processing file %s' % f)
        else:
            log.debug('skipping file %s ' % _file)
Example 32
def main():
    '''
    Reads tweets from the queue, annotates them with country-level geographic
    information (optionally filtered by region) and publishes them to a new queue.
    '''
    ap = args.get_parser()
    ap.add_argument(
        '--cat',
        action="store_true",
        help='Read input from standard in and write to standard out.')
    ap.add_argument('--region',
                    metavar='REGION',
                    type=str,
                    default=None,
                    help='Specify region to filter by')
    arg = ap.parse_args()
    logs.init(arg)
    filter_region = arg.region
    geoc = GeoCountry()
    try:
        if arg.cat:
            log.debug('Reading from stdin and writing to stdout.')
            ins = sys.stdin
            outs = codecs.getwriter('utf-8')(sys.stdout)
            for entry in ins:
                entry = entry.decode(encoding='utf-8')
                try:
                    tweet = json.loads(entry.strip())
                    tweet = annotate(tweet, geoc, filter_region)
                    if tweet is not None:
                        outs.write(json.dumps(tweet, ensure_ascii=False))
                        outs.write('\n')
                        outs.flush()
                except Exception:
                    log.exception('Failed to process message "%s".', entry)

        else:
            queue.init(arg)
            iqueue.init(arg)
            qname = "{}-geoCountry-{}".format(os.environ["CLUSTERNAME"],
                                              filter_region)
            with iqueue.open(arg.sub, 'r', qname=qname) as inq:
                with queue.open(arg.pub,
                                'w') as outq:  # , capture=True) as outq:
                    for tweet in inq:
                        try:
                            content = annotate(tweet, geoc, filter_region)
                            if content is not None:
                                outq.write(content)
                        except KeyboardInterrupt:
                            log.info("Got SIGINT, exiting.")
                            break
                        except Exception:
                            log.exception('Failed to process message "%s".',
                                          tweet)

        return 0

    except Exception as e:
        log.exception("Unknown error in main function-{0!s}.".format(e))
        return 1
Example 33
def main():
    logs.init(l=logs.DEBUG)
    print(get_data(sources=['Warnings']))
Example 34
def main():
    """
    Utility to cache messages from a queue into Elasticsearch
    -q | --queue   : Read from <queue> and write the messages to Elasticsearch. Settings are read from embers.conf
    --log_file     : Path to write the log file to
    --log_level    : Logging level
    """
    from etool import args
    global log

    arg_parser = args.get_parser()
    arg_parser.add_argument('-q',
                            '--queue',
                            help='Queue name to index into Elasticsearch')
    arg_parser.add_argument(
        '-s',
        '--s3fromq',
        action='store_true',
        help='ingest from S3 prefix derived from queue name')
    arg_parser.add_argument('-p', '--prefix', help='Ingest from prefix')
    #arg_parser.add_argument('-t', '--typename', default='noqueue', help='Type for prefix ingest')
    arg_parser.add_argument('-t', '--typename', help='Type for prefix ingest')
    arg_parser.add_argument(
        '-l',
        '--tmpcopy',
        default='/home/embers/data/tmpcopy',
        help='Name of local copy of S3 file (same for all S3 files)')
    arg_parser.add_argument('-c',
                            '--chunk',
                            type=int,
                            default=100,
                            help='Chunk size for S3 ingest')
    arg_parser.add_argument('-i',
                            '--clustername',
                            help='Clustername to determine index name')
    arg_parser.add_argument(
        '-w',
        '--withbase',
        action="store_true",
        help="Add basename to prefix when looking for type.")
    arg_parser.add_argument('--startdate',
                            help='start date in format like 2015-01-02')
    arg_parser.add_argument('--enddate',
                            help='end date in format like 2015-01-02')
    arg = arg_parser.parse_args()

    #assert (arg.queue or (arg.prefix and arg.typename)), 'Either --queue (with optional --s3fromq/--typename) or --prefix with --typename must be provided'
    assert (
        arg.queue or arg.prefix
    ), 'Either --queue (with optional --s3fromq/--typename) or --prefix  must be provided'

    log = logs.getLogger(log_name=arg.log_file)
    logs.init(arg, l=arg.log_level, logfile=arg.log_file)

    index_name = general.get_index_name(arg.clustername)

    queue.init()
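    # Two ingest paths below: bulk-load historical data from an S3 prefix, or attach to a live queue and index messages as they arrive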

    if arg.prefix or (arg.queue and arg.s3fromq):
        if arg.prefix:
            prefix = arg.prefix
            # get queue name or its substitute for S3 objects from prefix
            if arg.typename:
                type_name = arg.typename
            else:
                type_name = queue.conf.get_prefixpair(
                    prefix=prefix, includeS3=True, withBasename=arg.withbase)
                if not type_name:
                    log.error("Could not get type from prefix %s" % prefix)
                    return 1
                log.warning("type_name=%s from prefix=%s" %
                            (type_name, prefix))
        else:
            type_name = arg.queue
            prefix, include = queue.conf.get_prefix_for_queue(
                type_name, withBasename=False)
            if not prefix:
                log.error("Could not get S3 prefix for queue %s" % type_name)
                return 1

        if not general.get_es_connection().indices.exists_type(
                index=index_name, doc_type=type_name):
            # Create mapping if the queue has not been stored in Elasticsearch yet
            index_setup.add_type(index_name=index_name, type_name=type_name)

        conn_s3 = boto.connect_s3(aws_access_key_id=arg.aws_key,
                                  aws_secret_access_key=arg.aws_secret)
        bucket = conn_s3.get_bucket(
            arg.bucket)  # connect to S3, get bucket ptr for arg.bucket
        attach_to_s3(index_name,
                     s3prefix=prefix,
                     bucket=bucket,
                     type_name=type_name,
                     tmpcopy=arg.tmpcopy,
                     chunk_size=arg.chunk,
                     startdate=arg.startdate,
                     enddate=arg.enddate)
    else:

        if arg.typename:
            type_name = arg.typename
        else:
            type_name = arg.queue

        if not general.get_es_connection().indices.exists_type(
                index=index_name, doc_type=type_name):
            # Create mapping if the queue has not been stored in Elasticsearch yet
            index_setup.add_type(index_name=index_name, type_name=type_name)

        attach_to_queue(index_name=index_name,
                        queue_name=arg.queue,
                        type_name=type_name)
Example 35
def execute(arg):
    logs.init(arg)
    log.info("*************************************")
    log.info("PSL 4 Elections pipeline initializing")
    log.info("tweet folder------> " + arg.tweetFolder)
    log.info("dataFolder--------> " + str(arg.dataFolder))
    log.info("fromDate----------> " + arg.fromDate)
    log.info("toDate------------> " + arg.toDate)
    log.info("window------------> " + str(arg.window) + " day(s)")
    log.info("userThreshold-----> " + str(arg.userThreshold))
    log.info("wordThreshold-----> " + str(arg.wordThreshold))
    log.info("makePredictions---> " + arg.predictionFlag)
    log.info("*************************************")

    fromDate = datetime.strptime(arg.fromDate, "%d %b %Y")
    toDate = datetime.strptime(arg.toDate, "%d %b %Y")
    currentDate = fromDate

    iterCount = 1
    membership = {}
    likes = {}
    hates = {}
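    # membership, likes and hates persist across iterations and are passed to postProcess each day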

    if arg.predictionFlag == '1':
        makePredictions(arg.dataFolder + '/' + arg.country, fromDate, toDate, arg.userThreshold, arg.configFile, arg.regressionFlag)
        sys.exit()

    while(currentDate <= toDate):
        log.info("iterCount--------------->" + str(iterCount))
        log.info("processing PSL pipeline for %s" % (currentDate.strftime("%d %b %Y")))

        log.debug("creating the directory substructure for current date")
        inputFolder = arg.dataFolder + '/' + arg.country + '/' + currentDate.strftime("%d%b") + '/inputs'
        outputFolder = arg.dataFolder + '/' + arg.country + '/' + currentDate.strftime("%d%b") + '/outputs'
        statsFolder = arg.dataFolder + '/' + arg.country + '/' + currentDate.strftime("%d%b") + '/stats'
        os.system('mkdir -p ' + inputFolder)
        os.system('mkdir -p ' + outputFolder)
        os.system('mkdir -p ' + statsFolder)

        nextDate = currentDate + timedelta(days=arg.window)
        if(nextDate <= toDate):
            log.debug("creating the directory substructure for next date")
            nextInputFolder = arg.dataFolder + '/' + arg.country + '/' + nextDate.strftime("%d%b") + '/inputs'
            nextOutputFolder = arg.dataFolder + '/' + arg.country + '/' + nextDate.strftime("%d%b") + '/outputs'
            nextStatsFolder = arg.dataFolder + '/' + arg.country + '/' + nextDate.strftime("%d%b") + '/stats'
            os.system('mkdir -p ' + nextInputFolder)
            os.system('mkdir -p ' + nextOutputFolder)
            os.system('mkdir -p ' + nextStatsFolder)

        if iterCount == 1:
            keywordList = []
        # adding seed words to the list
            seedWordList = []
            with open(arg.seedFile, 'r') as file:
                for line in file:
                    word, group, weight = line.split(',')
                    seedWordList.append(word)
        keywordList = list(set(keywordList).union(set(seedWordList)))

        log.info("copying the seedFile to inputFolder")
        os.system('cp ' + arg.seedFile + ' ' + inputFolder + '/seedWords.csv')

        preProcess(arg.tweetFolder, inputFolder, keywordList, currentDate, nextDate, arg.country)
        log.info("***********preProcess complete******************")

        executePSLCode(inputFolder, outputFolder, arg.classPathFile)
        log.info("***********PSL code complete********************")

        postProcess(outputFolder, nextInputFolder, statsFolder, arg.userThreshold, arg.wordThreshold, membership, likes, hates)
        log.info("**********postProcess complete*****************")

        log.info("deleting the database used for current iteration")
        os.system('rm /home/aravindan/Dropbox/git/ms_thesis/psl/electionLDADB*')

        iterCount += 1
        currentDate = nextDate
        log.info("**********************************************************")
Example 36
import xlrd
import json
import datetime
import hashlib
import os
import argparse
import sys
from etool import logs
from Util import calculator
import sqlite3 as lite

"""
    Initiate the Json data for all the Indices
"""
logs.init()
__processor__ = os.path.basename(__file__.split(".")[0])
log =  logs.getLogger(__processor__)

def transcsv2json(xlsFile):
    wb = xlrd.open_workbook(xlsFile)
    sh = wb.sheet_by_name('Sheet1')
    #read the stock index from file
    stockIndex = sh.row_values(0,0)[0].split(" ")[0]
    stockPrices = []
    for rownum in range(2,sh.nrows):
        try:
            time_tuple = xlrd.xldate_as_tuple(sh.row_values(rownum,0)[0],0)
            post_date = datetime.datetime.strftime(datetime.date(time_tuple[0],time_tuple[1],time_tuple[2]),"%Y-%m-%d")
            lastPrice = float(sh.row_values(rownum,0)[1])
            previousCloseValue = float(sh.row_values(rownum,0)[3])
           
Example 37
def main():
    ap = args.get_parser()
    ap.add_argument('-t', '--tweetFolder', type=str,
                    help='input folder containing tweet files',
                    default='/hdd/tweets/2012/oct')
    ap.add_argument('-df', '--dataFolder', type=str,
                    help='folder to store intermediate outputs and final outputs',
                    default='/home/aravindan/Dropbox/git/ms_thesis/data/psl')
    ap.add_argument('-ut', '--userThreshold', type=float,
                    help='probability threshold of user membership',
                    default=0.60)
    ap.add_argument('-wt', '--wordThreshold', type=float,
                    help='probability threshold for vocab',
                    default=0.70)
    ap.add_argument('-s', '--seedFile', type=str,
                    help='seed File containing the initial seed vocabulary',
                    default='/home/aravindan/Dropbox/git/ms_thesis/psl/seedWords/venezuela.csv')
    ap.add_argument('-c', '--country', type=str,
                    help='country to model elections for',
                    default='venezuela')
    ap.add_argument('-w', '--window', type=int,
                    help='number of days of tweets used to infer',
                    default=1)
    ap.add_argument('-d1', '--fromDate', type=str,
                    help='date from which to track tweets',
                    default='01 Oct 2012')
    ap.add_argument('-d2', '--toDate', type=str,
                    help='date to which to track tweets',
                    default='06 Oct 2012')
    ap.add_argument('-cp', '--classPathFile', type=str,
                    help='file containing class path for PSL execution',
                    default='/home/aravindan/Dropbox/git/ms_thesis/psl/classPathFile.txt')

    arg = ap.parse_args()
    logs.init(arg)

    log.info("*************************************")
    log.info("PSL 4 Elections pipeline initializing")
    log.info("tweet folder------> " + arg.tweetFolder)
    log.info("dataFolder--------> " + str(arg.dataFolder))
    log.info("fromDate----------> " + arg.fromDate)
    log.info("toDate------------> " + arg.toDate)
    log.info("window------------> " + str(arg.window) + " day(s)")
    log.info("userThreshold-----> " + str(arg.userThreshold))
    log.info("userThreshold-----> " + str(arg.userThreshold))
    log.info("*************************************")

    fromDate = datetime.strptime(arg.fromDate, "%d %b %Y")
    toDate = datetime.strptime(arg.toDate, "%d %b %Y")
    currentDate = fromDate

    iterCount = 1
    membership = {}
    vocab = {}
    filesProcessed = []
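    # filesProcessed tracks tweet files already consumed; preProcess receives it, presumably so already-seen files are skipped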

    while(currentDate <= toDate):
        log.info("iterCount--------------->" + str(iterCount))
        log.info("processing PSL pipeline for %s" % (currentDate.strftime("%d %b %Y")))

        log.info("creating the directory substructure for current iteration")
        inputFolder = arg.dataFolder + '/' + arg.country + '/iteration' + str(iterCount) + '/inputs'
        outputFolder = arg.dataFolder + '/' + arg.country + '/iteration' + str(iterCount) + '/outputs'
        statsFolder = arg.dataFolder + '/' + arg.country + '/iteration' + str(iterCount) + '/stats'
        os.system('mkdir -p ' + inputFolder)
        os.system('mkdir -p ' + outputFolder)
        os.system('mkdir -p ' + statsFolder)

        log.info("creating the directory substructure for next iteration")
        nextInputFolder = arg.dataFolder + '/' + arg.country + '/iteration' + str(iterCount + 1) + '/inputs'
        nextOutputFolder = arg.dataFolder + '/' + arg.country + '/iteration' + str(iterCount + 1) + '/outputs'
        nextStatsFolder = arg.dataFolder + '/' + arg.country + '/iteration' + str(iterCount + 1) + '/stats'
        os.system('mkdir -p ' + nextInputFolder)
        os.system('mkdir -p ' + nextOutputFolder)
        os.system('mkdir -p ' + nextStatsFolder)

        if iterCount == 1:
            keywordList = []
        # adding seed words to the list
        seedWordList = []
        with open(arg.seedFile, 'r') as file:
            for line in file:
                word, group, weight = line.split(',')
                seedWordList.append(word)
        keywordList = list(set(keywordList).union(set(seedWordList)))

        log.info("copying the seedFile to inputFolder")
        os.system('cp ' + arg.seedFile + ' ' + inputFolder + '/seedWords.csv')

        fileDate, filesProcessed = preProcess(arg.tweetFolder, inputFolder, keywordList, currentDate, toDate, arg.country, list(set(filesProcessed)))
        log.info("***********preProcess complete******************")

        executePSLCode(inputFolder, outputFolder, arg.classPathFile)
        log.info("***********PSL code complete********************")

        keywordList = postProcess(outputFolder, nextInputFolder, statsFolder, arg.userThreshold, arg.wordThreshold, membership, vocab, seedWordList, currentDate)
        log.info("**********postProcess complete*****************")

        if fileDate > currentDate:
            currentDate = fileDate

        log.info("deleting the database used for current iteration")
        os.system('rm /home/aravindan/Dropbox/git/ms_thesis/psl/electionPSLDB*')

        iterCount += 1
        log.info("**********************************************************")
Example 38
def main():
    ap = args.get_parser()
    ap.add_argument(
        "-t", "--tweetFolder", type=str, help="inputFolder pointing to PSLs output", default="/hdd/tweets/2012/oct"
    )
    ap.add_argument(
        "-df",
        "--dataFolder",
        type=str,
        help="folder to store intermediate outputs and final outputs",
        default="/home/aravindan/Dropbox/git/ms_thesis/data/dqe",
    )
    ap.add_argument(
        "-wt", "--wordThreshold", type=float, help="n-percent of words to propagate to next iteration", default=25
    )
    ap.add_argument(
        "-s",
        "--seedFile",
        type=str,
        help="seed File containing the intial seed vocabulary",
        default="/home/aravindan/Dropbox/git/ms_thesis/psl/seedWords/venezuela.csv",
    )
    ap.add_argument("-d1", "--fromDate", type=str, help="date from which to track tweets", default="01 Oct 2012")
    ap.add_argument("-d2", "--toDate", type=str, help="date to which to track tweets", default="06 Oct 2012")
    ap.add_argument("-w", "--window", type=int, help="number of days of tweets used to infer", default=1)
    ap.add_argument("-c", "--country", type=str, help="country to execute the pipeline for", default="venezuela")

    arg = ap.parse_args()
    logs.init(arg)

    log.info("*************************************")
    log.info("PSL 4 Elections pipeline initializing")
    log.info("tweet folder------> " + arg.tweetFolder)
    log.info("dataFolder--------> " + str(arg.dataFolder))
    log.info("fromDate----------> " + arg.fromDate)
    log.info("toDate------------> " + arg.toDate)
    log.info("window------------> " + str(arg.window) + " day(s)")
    log.info("country-----------> " + arg.country)
    log.info("wordThreshold-----> " + str(arg.wordThreshold) + "%")
    log.info("*************************************")

    fromDate = datetime.strptime(arg.fromDate, "%d %b %Y")
    toDate = datetime.strptime(arg.toDate, "%d %b %Y")
    currentDate = fromDate

    iterCount = 1
    vocab = {}
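    # vocab maps group -> {word: weight}; it is seeded from the seed file on the first iteration and re-weighted from tweet counts afterwards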

    while currentDate <= toDate:
        log.info("iterCount--------------->" + str(iterCount))
        log.info("processing PSL pipeline for %s" % (currentDate.strftime("%d %b %Y")))

        log.info("creating the directory substructure for current date")
        outputFolder = arg.dataFolder + "/" + arg.country + "/" + currentDate.strftime("%d%b")
        os.system("mkdir -p " + outputFolder)

        nextDate = currentDate + timedelta(days=arg.window)

        if iterCount == 1:
            with open(arg.seedFile, "r") as file:
                for line in file:
                    word, group, weight = line.split(",")
                    if group in vocab:
                        vocab[group][word] = weight
                    else:
                        vocab[group] = {}
                        vocab[group][word] = weight

        counts = trackTweets(arg.tweetFolder, vocab, currentDate, nextDate, arg.country, arg.wordThreshold)
        log.info("***********trackTweets complete***************")

        # normalize the counts for each group to the [0, 1] range
        for group in counts:
            max_count = max(counts[group].values() or [1]) or 1
            for word in counts[group]:
                # use float division so Python 2 integer division does not truncate the weights
                vocab[group][word] = float(counts[group][word]) / max_count

        # dumping the vocab learnt
        for group in vocab:
            with open(outputFolder + "/" + group + "_vocab.csv", "w") as f:
                sorted_tuples = sorted(vocab[group].iteritems(), key=operator.itemgetter(1), reverse=True)
                for (word, weight) in sorted_tuples:
                    f.write(word + "," + str(weight))
                    f.write("\n")

        currentDate = nextDate
        iterCount += 1
        log.info("*********************************************")

    log.info("************ALL iterations complete********************")
Example n. 39
0
def main():
    global CONFIG, VOCABULARY_FILE, WARNING_PORT, SURROGATE_PORT, __version__, KEY_ID, SECRET, T_EASTERN, T_UTC

    "Initiate the TimeZone Setting"
    T_UTC = pytz.utc
    T_EASTERN = pytz.timezone("US/Eastern")

    "Get the input arg"
    arg = parse_args()
    rege_date = arg.rege_date
    KEY_ID = arg.aws_key
    SECRET = arg.aws_secret
    logs.init(arg)
    queue.init(arg)
    conn = boto.connect_sdb(KEY_ID, SECRET)
    # initialize the SimpleDB domains
    surrogateDomain = get_domain(conn, arg.surrogate_domain)
    warningDomain = get_domain(conn, arg.warning_domain)

    WARNING_PORT = arg.warning_port
    SURROGATE_PORT = arg.surrogate_port
    "if rege_date is not none, it means to regenerate the past day's prediction"
    if not rege_date:
        "Normal predict"
        predict_date = arg.predict_date
        model_cfg = arg.model_cfg

        stock_list = None
        if arg.stock_list:
            stock_list = arg.stock_list
        "Get the Latest version of Config Object"
        f = open(model_cfg, "r")
        configObj = json.load(f)
        f.close()
        con_versionNum = max([int(v) for v in configObj.keys()])
        CONFIG = configObj[str(con_versionNum)]
        "Get the Latest version of Trend Range Object"
        clusterTrends = json.load(sys.stdin)
        trend_versionNum = max([int(v) for v in clusterTrends.keys()])
        CONFIG["trendRange"] = {"version": str(trend_versionNum), "range": clusterTrends[str(trend_versionNum)]}

        if not stock_list:
            stock_list = CONFIG["stocks"]

        #"Retrain the model configuration if current day is Saturday"
        #weekDay = datetime.strptime(predict_date,"%Y-%m-%d").weekday()
        #if weekDay == 5:
        #    finalClusterProbability, finalClusterMatrix = re_training(surrogateDomain, predict_date, stock_list)
        #    new_config = json.loads(json.dumps(CONFIG))
        #    new_config["clusterProbability"] = finalClusterProbability
        #    new_config["clusterContribution"] = finalClusterMatrix
        #    "Write back to configure file"
        #    new_version_num = con_versionNum + 1
        #    new_config["version"] = new_version_num
        #    configObj[str(new_version_num)] = new_config
        #    with open(model_cfg, "w") as out_q:
        #        out_q.write(json.dumps(configObj))

        "Process stock each by each"
        for stock in stock_list:
            surrogate = process_single_stock(surrogateDomain, predict_date,stock)
            if surrogate:
                warning = warning_check(warningDomain, surrogate)

    else:
        "regenerate the old prediction"
        model_cfg = arg.model_cfg
        stock_list = None
        if arg.stock_list:
            stock_list = arg.stock_list

        "Get the version of Config Object for the indicated prediction"
        versionObj = get_predicion_version(warningDomain, rege_date)
        configVersionNum = versionObj["configVersion"]
        trendVersionNum = versionObj["trendVersion"]

        with open(model_cfg) as f:
            configObj = json.load(f)
        if configVersionNum in configObj:
            CONFIG = configObj[configVersionNum]
        else:
            CONFIG = configObj["1"]

        "Get the Latest version of Trend Range Object"
        clusterTrends = json.load(sys.stdin)
        #get the latest version of the warning
        tmpVersion = int(trendVersionNum)
        while tmpVersion >= 1:
            if str(tmpVersion) in clusterTrends:
                trendVersionNum = str(tmpVersion)
                break
            else:
                tmpVersion -= 1

        CONFIG["trendRange"] = {"version": str(trendVersionNum), "range": clusterTrends[trendVersionNum]}

        if not stock_list:
            stock_list = CONFIG["stocks"]

        "Process stock each by each"
        for stock in stock_list:
            replayIO = StringIO.StringIO()
            surrogate = process_single_stock(surrogateDomain, rege_date, stock, True, replayIO)
            if surrogate:
                warning = warning_check(warningDomain, surrogate, True, replayIO)
            replayInfo = replayIO.getvalue()
            weid = getwarningeid(surrogateDomain, rege_date, stock)
            with open("./demo/%s.txt" % weid, "w") as win:
                win.write(replayInfo)

    if conn:
        conn.close()
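
The two version lookups above follow the same convention: model_cfg and the trend-range object read from stdin are JSON dicts keyed by stringified version numbers. A minimal sketch of those lookups as helpers (the function names are invented for illustration):

def latest_version(obj):
    # return the entry stored under the highest numeric version key
    return obj[str(max(int(v) for v in obj.keys()))]

def nearest_version(obj, version):
    # walk down from `version` to the closest version key actually present
    v = int(version)
    while v >= 1:
        if str(v) in obj:
            return str(v)
        v -= 1
    return None
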
Example n. 40
0
    def __init__(self, wg_data=WG_DATA, co_admin_data=CO_ADMIN_DATA,
                 priority_policy=PRIORITY_POLICY,
                 debug=False):
        """
        """
        self.priority_policy = priority_policy
        self.debug = debug
        self.__version__ = "{0}-{1}-{2}-{3}-{4}".format(
            self.__class__.__name__,
            __version__,
            hashlib.md5(get_wg_data(wg_data).read()).hexdigest(),
            hashlib.md5(get_co_admin_data(co_admin_data).read()).hexdigest(),
            hashlib.md5(" ".join(self.priority_policy)).hexdigest())

        if self.debug:
            try:
                logs.init()
            except IOError:
                logs.init(logfile=self.__class__.__name__.lower())

            self.log = logs.getLogger("{0}-{1}".format(
                                      self.__class__.__name__,
                                      __version__.replace('.', '_')))

        # 1. load country and admin1 level geo data
        f = get_co_admin_data(co_admin_data)
        dialect = csv.Sniffer().sniff(f.read(10240), delimiters="\t")
        f.seek(0)
        reader = csv.DictReader(f, dialect=dialect, fieldnames=CO_ADMIN_FIELDS)
        # NOTE:
        # Known conflicts b/w codes of countries and other admins
        # co Colombia ('Colombia', 'Córdoba')
        # cl Chile ('Colombia', 'Caldas')
        # ar Argentina ('Colombia', 'Arauca')
        # sv El Salvador ('El Salvador', 'San Vicente')

        # prep lookup dictionaries
        # key__value

        # countries
        self.co_code = {}
        self.co_names = {}
        self.co_aliases = {}
        self.co_capital_cities = {}
        # admin1
        self.admin_code = {}
        self.admin_name = {}
        # assumes countries appear first when reading data from
        # lac_co_admin TODO BAD!
        for r in reader:
            for k in r.keys():
                r[k] = r[k].strip()
            lat = float_or_none(r['latitude'])
            lon = float_or_none(r['longitude'])
            code = object_or_none(r['iso_3166_code'])
            rid = int_or_none(r["id"])
            if r['type'] == 'country':
                # country
                if code:
                    self.co_code[code] = r['name']
                    self.co_names[nstr(r['name'])] = (rid, lat, lon,
                                                      code, r['name'])
                    self.co_capital_cities[nstr(r['capital_city'])] =\
                        (r['capital_city'], r['name'])
                    aliases = r['alt_names'].split(',')
                    self.co_aliases.update({nstr(alias.strip()): r['name']
                                            for alias in aliases})
                else:
                    if self.debug:
                        self.log.error("Bad data country {0} Code {1}".format(
                                       r['name'], code))
            elif r['type'] == 'admin':
                # admin
                admin, co = r['full_name'].split(',')
                admin, co = admin.strip(), co.strip()

                if code:
                    if code not in self.admin_code:
                        self.admin_code[code] = []
                    self.admin_code[code].append((co, admin))
                co1, a = nstr(co), nstr(admin)
                if a not in self.admin_name:
                    self.admin_name[a] = {}
                if co1 not in self.admin_name[a]:
                    self.admin_name[a][co1] = (rid, lat, lon, code, admin, co)

        f.close()

        # 2. load (world-gazeteer) city level geo data
        f = get_wg_data(wg_data)
        dialect = csv.Sniffer().sniff(f.read(10240), delimiters="\t")
        f.seek(0)
        reader = csv.DictReader(f, dialect=dialect, fieldnames=WG_FIELDS)
        self.ci_aliases = {}
        # main data store for geocoding
        self.data = []
        counter = 0
        ci_set = set()
        for r in reader:
            for k in r.keys():
                r[k] = r[k].strip()
            # get alias names for cities
            ci_names = [a.strip() for a in r['alt_names'].split(',')
                        if len(a.strip()) > 0]
            ci_names.extend([a.strip() for a in r['orig_names'].split(',')
                             if len(a.strip()) > 0])
            for ci in ci_names:
                k = (nstr(ci), nstr(r['country']))
                a1 = nstr(r['admin1'])
                if k not in self.ci_aliases:
                    self.ci_aliases[k] = {a1: set([r['name']])}
                elif a1 not in self.ci_aliases[k]:
                    self.ci_aliases[k][a1] = set([r['name']])
                else:
                    # Cases where different cities for same
                    # admin-country pair have the same alias
                    self.ci_aliases[k][a1].add(r['name'])
                # add ci name aliases into ci_set
                ci_set.add(nstr(ci))
            # store only canonical city names
            self.data.append((counter, (r['name'], r['country'],
                              r['admin1'],
                              object_or_none(r['admin2']),
                              object_or_none(r['admin3']),
                              int_or_none(r['pop']),
                              float_or_none(r['latitude']) / 100,
                              float_or_none(r['longitude']) / 100,
                              int(r['id']), int(r['padded']))))
            counter += 1

        self.coordinates = {}
        # cases where admin1 and city share the same name
        # extended feature/hack #1 to resolve city when
        # only country and admin1 are specified
        self.same_ci_a1_name = {}
        for i, (n, c, a1, a2, a3, p, lat, lon, i_d, pad) in self.data:
            nn, nc, na1 = nstr(n), nstr(c), nstr(a1)
            self.coordinates[(lat, lon)] = i
            if nn == na1 and pad == 0:
                self.same_ci_a1_name[(nc, na1)] = n
            ci_set.add(nn)

        # store (lat, lon)
        self.kdtree = KDTree([[i, j] for i, j in self.coordinates.keys()
                              if i is not None and j is not None])
        # build regular expr dicts
        co_set = set(self.co_names.keys())
        # add country name aliases into co_set
        co_set.update(self.co_aliases.keys())
        self.co_reg = ManyRE(co_set)
        self.ci_reg = ManyRE(ci_set)
        # add admin1 name aliases into admin1_set
        admin1_set = set(self.admin_name.keys())
        # build regular expression stores for co-admin1-ci
        self.admin1_reg = ManyRE(admin1_set)
        # add stopwords to prevent any 2-letter word in common usage
        # from being misinterpreted as a country or admin code
        two_letter_stop_words = set(
            ['BE', 'WE', '\xc3\xa0', 'YO', 'DO', 'YA', 'DE', 'DA', 'HA', 'BY',
             'HE', 'AL', 'NI', 'LE', 'NO', 'LO', 'TU', 'TO', 'TI', 'TE', 'EM',
             'EL', 'EN', 'IS', 'OS', 'AM', 'IT', 'AO', 'AN', 'AS', 'AT', 'IN',
             'EU', 'ES', 'IF', 'ME', 'ON', 'OF', 'LA', 'MI', 'UP', 'SU', 'UM',
             'UN', 'SO', 'NA', 'OU', 'MY', 'OR', 'SE', 'US'])

        self.co_code_reg = ManyRE([sw for sw in self.co_code.keys()
                                  if sw not in two_letter_stop_words])
        self.admin1_code_reg1 = ManyRE(self.admin_code.keys())
        self.admin1_code_reg2 = ManyRE([sw for sw in self.admin_code.keys()
                                       if sw not in two_letter_stop_words])

        self.bguess = {}
        for i, (city, country, admin1, a2, a3, p, la, lo, i_d, pad)\
                in self.data:

            ci, co, a = nstr(city), nstr(country), nstr(admin1)
            # value is list of admin1's that correspond to ci-co key
            # ci-co makes dictionary flatter
            # choose not to use co-admin1-ci as key to add more flexibility
            # for lookups
            if ci in self.bguess:
                if co in self.bguess[ci]:
                    if a in self.bguess[ci][co]:
                        # store original wg-records marked with pad = 0
                        # to head of the queue
                        if pad == 0:
                            self.bguess[ci][co][a].appendleft(i)
                        else:
                            self.bguess[ci][co][a].append(i)
                    else:
                        self.bguess[ci][co][a] = deque([i])
                else:
                    self.bguess[ci][co] = {a: deque([i])}
            else:
                self.bguess[ci] = {co: {a: deque([i])}}
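
As a usage illustration, a best-guess lookup against the bguess index built above could look like the sketch below; nstr is the same normalizer used during construction, and lookup_city is an invented module-level helper, not a method of the original class:

def lookup_city(geo, city, country, admin1):
    # `geo` is an instance of the class above
    ci, co, a = nstr(city), nstr(country), nstr(admin1)
    try:
        # original world-gazetteer records (pad == 0) sit at the left end of
        # each deque, so index 0 is the preferred candidate
        idx = geo.bguess[ci][co][a][0]
    except KeyError:
        return None
    # geo.data stores (index, record) pairs; return the record tuple
    return geo.data[idx][1]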