def main():
    global log
    ap = args.get_parser()
    ap.add_argument("--s_date", type=str, help="the start date to ingest: format mmddyyyy")
    ap.add_argument("--e_date", type=str, help="the end date to ingest: format mmddyyyy")
    ap.add_argument("--o", type=str, help="the output directory")
    ap.add_argument("--region", type=str, help="the region of the web site")
    arg = ap.parse_args()
    logs.init(arg)

    t_format = "%m%d%Y"
    s_date = datetime.strptime(arg.s_date, t_format)
    e_date = datetime.strptime(arg.e_date, t_format)
    d_delta = (e_date - s_date).days

    seen_it = shelve.open("%s_reuters_news_seen_it.db" % (arg.region))
    i = 0
    while i <= d_delta:
        day_str = datetime.strftime(s_date + timedelta(days=i), t_format)
        print "Extracting %s" % (day_str)
        # write the day's news to its own file
        with open("%s%s%s_ita_reuters_%s.txt" % (arg.o, os.sep, day_str, arg.region), "w") as w:
            daily_news = get_daily_news(day_str, seen_it, arg.region)
            for news in daily_news:
                w.write(json.dumps(news) + "\n")
        i += 1
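# A hypothetical invocation of the Reuters ingest above (the script name and paths are
# assumptions, not taken from the source):
#
#   python reuters_news_ingest.py --s_date 01012013 --e_date 01072013 \
#       --o /tmp/reuters --region mexico
#
# which, per the format string above, writes one JSON-lines file per day, e.g.
# /tmp/reuters/01012013_ita_reuters_mexico.txt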
def main():
    ap = args.get_parser()
    ap.add_argument('--rev_file', metavar='FILE', type=str, required=False,
                    help='File which stores last revision id.',
                    default="last_rev_file.txt")
    arg = ap.parse_args()
    logs.init(arg)
    log.info("Run Started")

    api = API()
    recent_changes = api.get_recent_changes()

    # Filter to only revisions newer than last run
    last_rev_file = arg.rev_file
    if os.path.exists(last_rev_file):
        with open(last_rev_file) as f:
            last_rev = int(f.read())
        recent_changes = [change for change in recent_changes if change['revid'] > last_rev]

    recently_changed_page_ids = set(str(change['pageid']) for change in recent_changes)
    latest_revisions = api.get_latest_revision(recently_changed_page_ids)
    print(len(latest_revisions))
    print(latest_revisions)
    # TODO actually do something with revisions

    with open(last_rev_file, mode='w') as f:
        f.write(str(max([int(page['revisions'][0]['revid']) for page in latest_revisions.values()])))
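# The checkpoint pattern above (persist the highest revision id seen, then filter the next
# run's changes against it) isolated as a small sketch. This is not code from the source;
# the file format (a bare integer) is the assumption the function above already makes.
import os


def read_checkpoint(path, default=0):
    # Return the last stored revision id, or `default` when no checkpoint exists yet.
    if not os.path.exists(path):
        return default
    with open(path) as f:
        return int(f.read().strip())


def write_checkpoint(path, rev_id):
    # Overwrite the checkpoint with the newest revision id that was processed.
    with open(path, "w") as f:
        f.write(str(rev_id))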
def main():
    """
    Utility to cache messages from all queues on the --hostname provided
    that have the 'cache: true' option set in embers.conf
    --hostname  : Cache all active queues on this host
    --log_file  : Path to write the log file to
    --log_level : Logging level
    """
    from etool import args
    global log

    arg_parser = args.get_parser()
    arg_parser.add_argument('--hostname', metavar='HOSTNAME', type=str,
                            default=environ.get('HOSTNAME', None),
                            help="The hostname of the machine whose services' data you wish to cache")
    arg = arg_parser.parse_args()
    log = logs.getLogger(log_name=arg.log_file)
    logs.init(arg, l=arg.log_level, logfile=arg.log_file)
    conf.init(arg)

    assert arg.hostname, '--hostname must be provided'
    queues = conf.get_all_cached_queues(hostname=arg.hostname)
    pool = []
    for queue in queues:
        log.info('Spawning cache process for %s' % queue)
        p = multiprocessing.Process(name=queue, target=cache_queue, args=(queue,))
        p.start()
        pool.append(p)

    try:
        for process in pool:
            process.join()
            log.warn('%s caching has stopped' % process.name)
    except KeyboardInterrupt:
        log.warn('Keyboard interrupt in main')
def main():
    ap = args.get_parser()
    ap.add_argument('--test', action="store_true",
                    help="Test flag; if present, treat this run as a test case")
    arg = ap.parse_args()
    assert arg.sub, 'Need a queue to subscribe to'
    assert arg.pub, 'Need a queue to publish to'
    logs.init(arg)
    queue.init(arg)
    test_flag = arg.test

    conn = boto.connect_sdb()
    with queue.open(arg.sub, 'r') as inq:
        for m in inq:
            try:
                durationProcess(conn, m, arg.pub, test_flag)
            except KeyboardInterrupt:
                log.info('GOT SIGINT, exiting!')
                break
            except EmbersException as e:
                log.exception(e.value)
            except:
                log.exception("Unexpected exception in process")
def init():
    global con
    global cur
    con = common.getDBConnection()
    cur = con.cursor()
    logs.init()
def main():
    ap = args.get_parser()
    ap.add_argument('-i', '--inputFolder', type=str, help='input folder containing twitter files',
                    default='/hdd/tweets/2012/may')
    ap.add_argument('-s', '--scoresFolder', type=str, help='folder containing scoreCards',
                    default='../data/scores/MX/')
    ap.add_argument('-cf', '--configFile', type=str, help='election configuration file',
                    default='../configFiles/electionConfig_MX')
    ap.add_argument('-d1', '--fromDate', type=str, help='fromDate')
    ap.add_argument('-d2', '--toDate', type=str, help='toDate')
    ap.add_argument('-f1', '--flag1', help="countOrPredict", type=str, default='2')
    ap.add_argument('-r', '--regression', help="regressionType", type=str, default='LASSO')
    ap.add_argument('-f2', '--flag2', help="flag to push surrogates and warnings to S3", type=str, default='0')
    arg = ap.parse_args()
    logs.init(arg)

    try:
        elections = Elections(arg.inputFolder, arg.scoresFolder, arg.configFile, arg.fromDate, arg.toDate)
        log.info("Election class initialized")
    except Exception as e:
        log.exception("exception during initialization: %s. Quitting!!", e)

    try:
        if arg.flag1 == '1' or arg.flag1 == '3':
            elections.collectMentions()
    except Exception as e:
        log.exception("error while tracking tweets")

    try:
        if arg.flag1 == '2' or arg.flag1 == '3':
            winner, winningScore, runnerUp, runnerUpScore, finalScore = elections.getWinner(arg.fromDate, arg.toDate, arg.regression)
            print "------------Regression Results-----------"
            print finalScore
            print winner + "====>" + str(winningScore)
            print "-----------------------------------------"
    except Exception as e:
        log.exception("error while calculating winner:%s", e)

    try:
        elections.createSurrogate(winner, winningScore, runnerUp, runnerUpScore, arg.flag2)
    except Exception as e:
        log.exception("error during creating warnings")

    try:
        if arg.flag2 == '1':
            elections.storeStatistics(arg.fromDate, arg.toDate)
    except Exception as e:
        log.exception("error in storing statistics:%s", e)

    log.info("ALL Operations Complete")
def main():
    # Initialize arguments
    argparser = args.get_parser()
    argparser.add_argument('--local_port', help='Local port to connect to java server', required=True)
    arg = argparser.parse_args()
    localPort = int(arg.local_port)

    # Initialize log
    logs.init(arg)
    global log

    # Initialize the queue with arguments and connect to the specified feed
    log.info("Opening and connecting to queue %s", arg.sub)
    queue.init(arg)
    reader = queue.open(arg.sub, 'sub', ssh_key=arg.ssh_key, ssh_conn=arg.tunnel)

    # Initialize the writer to publish to a queue
    log.info("Publishing to queue %s", arg.pub)
    writer = queue.open(arg.pub, 'pub', ssh_key=arg.ssh_key, ssh_conn=arg.tunnel)

    count = 0
    # Connect to Java server
    while True:
        for feedmsg in reader:
            try:
                while True:
                    try:
                        sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
                        sock.connect(("localhost", localPort))
                        break
                    except:
                        log.info("Unable to connect to local server")

                log.debug("Connected to java server on port %d" % localPort)
                socketLines = sock.makefile()

                # Clean the message to fix irregularities
                feedmsg = message.clean(feedmsg)

                log.debug("Read message %d. Sending to java" % count)

                # Write message to socket stream
                sock.sendall(json.dumps(feedmsg))
                sock.sendall('\n')

                # Receive result from socket stream
                result = socketLines.readline()
                writer.write(json.dumps(result))
                count += 1

                sock.close()
            except KeyboardInterrupt:
                sys.exit(1)
        else:
            log.info("Server was disconnected.")
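# For testing the socket bridge above without the real Java service, a minimal stand-in
# server can speak the same wire protocol: one JSON document per line in, one result line
# back. This is a sketch under that assumption, not part of the source.
import json
import socket


def run_echo_server(port=9999):
    srv = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    srv.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
    srv.bind(("localhost", port))
    srv.listen(1)
    while True:
        conn, _ = srv.accept()
        lines = conn.makefile()
        msg = json.loads(lines.readline())                     # one JSON message per line
        conn.sendall((json.dumps(msg) + "\n").encode("utf-8"))  # echo it back as the "result" line
        conn.close()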
def main():
    # initiate parameters
    global TREND_RANGE

    # Initiate the TimeZone setting
    arg = parse_args()
    conn = boto.connect_sdb()
    operate_date = arg.operate_date
    start_date = arg.start_date
    end_date = arg.end_date
    port = arg.pub
    assert port, "Need a queue to publish to"
    logs.init(arg)
    queue.init(arg)
    t_domain = get_domain(conn, 't_enriched_bloomberg_prices')
    #trend_file = args.trend_file

    # Load the trend changeType range file
    trendObject = json.load(sys.stdin)

    # Get the latest version of the trend range
    trend_versionNum = max([int(v) for v in trendObject.keys()])

    # To avoid mutating the initial values, round-trip the JSON object through a string
    # before loading it into a new object
    TREND_RANGE = json.loads(json.dumps(trendObject[str(trend_versionNum)]))

    # If a date range is given, handle the data query for each of those days
    if start_date is None:
        # get raw price list
        raw_price_list = []
        rs = get_raw_data(conn, operate_date)
        for r in rs:
            raw_price_list.append(r)
        for raw_data in raw_price_list:
            process(t_domain, port, raw_data)
    else:
        t_format = "%Y-%m-%d"
        s_date = datetime.strptime(start_date, t_format)
        e_date = datetime.strptime(end_date, t_format)
        while s_date <= e_date:
            raw_price_list = []
            rs = get_raw_data(conn, datetime.strftime(s_date, t_format))
            for r in rs:
                raw_price_list.append(r)
            for raw_data in raw_price_list:
                process(t_domain, port, raw_data)
            s_date = s_date + timedelta(days=1)
            # sleep 5 s to wait for SimpleDB to commit
            time.sleep(5)

    # Write back the trend file
    new_version_num = trend_versionNum + 1
    trendObject[str(new_version_num)] = TREND_RANGE
    json.dump(trendObject, sys.stdout)
def main():
    logs.init(l=logs.DEBUG)
    # TODO translate non-English characters
    # print(get_tweets(keywords='esta'))
    # print(aggregate_tweets_over_time())
    # print(aggregate_tweets_by_country())
    # print(aggregate_tweets_by_country(lemmas='bill'))
    # print(aggregate_tweets_by_country(country='brazil', lemmas='bill'))
    # print(aggregate_tweets_by_country(country='brazil', lemmas='bill', entities=''))
    print(get_abbreviated_tweets_for_dqe('brazil', 'datasift-keyword',
                                         start_date='2015-03-27', end_date='2015-03-27'))
def main():
    '''
    Reads from the queue, retrieves the content from the source website
    and publishes the content to a new queue.
    '''
    ap = args.get_parser()
    ap.add_argument('--cat', action="store_true",
                    help='Read input from standard in and write to standard out.')
    arg = ap.parse_args()
    logs.init(arg)
    geo_mena = GeoMena()
    geo_lac = Geo(geo_region=GEO_REGION.lac)
    try:
        if arg.cat:
            log.debug('Reading from stdin and writing to stdout.')
            ins = sys.stdin
            outs = sys.stdout
            for entry in ins:
                entry = entry.decode(encoding='utf-8')
                try:
                    tweet = json.loads(entry.strip())
                    geo_annotate(tweet, geo_mena, geo_lac)
                    if tweet is not None:
                        outs.write(json.dumps(tweet, ensure_ascii=False).encode("utf-8"))
                        outs.write('\n')
                        outs.flush()
                except Exception:
                    log.exception('Failed to process message "%s".', entry)
        else:
            queue.init(arg)
            with queue.open(arg.sub, 'r') as inq:
                with queue.open(arg.pub, 'w', capture=True) as outq:
                    for tweet in inq:
                        try:
                            content = geo_annotate(tweet, geo_mena, geo_lac)
                            if content is not None:
                                outq.write(content)
                        except KeyboardInterrupt:
                            log.info("Got SIGINT, exiting.")
                            break
                        except Exception:
                            log.exception('Failed to process message "%s".', tweet)
        return 0
    except Exception as e:
        log.exception("Unknown error in main function-{}".format(str(e)))
        return 1
def main():
    '''
    Reads from the queue, retrieves the content from the source website
    and publishes the content to a new queue.
    '''
    ap = args.get_parser()
    ap.add_argument('--cat', action="store_true",
                    help='Read input from standard in and write to standard out.')
    ap.add_argument('--region', metavar='REGION', type=str, default=None,
                    help='Specify region to filter by')
    arg = ap.parse_args()
    logs.init(arg)
    filter_region = arg.region
    geoc = GeoCountry()
    try:
        if arg.cat:
            log.debug('Reading from stdin and writing to stdout.')
            ins = sys.stdin
            outs = codecs.getwriter('utf-8')(sys.stdout)
            for entry in ins:
                entry = entry.decode(encoding='utf-8')
                try:
                    tweet = json.loads(entry.strip())
                    tweet = annotate(tweet, geoc, filter_region)
                    if tweet is not None:
                        outs.write(json.dumps(tweet, ensure_ascii=False))
                        outs.write('\n')
                        outs.flush()
                except Exception:
                    log.exception('Failed to process message "%s".', entry)
        else:
            queue.init(arg)
            iqueue.init(arg)
            qname = "{}-geoCountry-{}".format(os.environ["CLUSTERNAME"], filter_region)
            with iqueue.open(arg.sub, 'r', qname=qname) as inq:
                with queue.open(arg.pub, 'w') as outq:  # , capture=True) as outq:
                    for tweet in inq:
                        try:
                            content = annotate(tweet, geoc, filter_region)
                            if content is not None:
                                outq.write(content)
                        except KeyboardInterrupt:
                            log.info("Got SIGINT, exiting.")
                            break
                        except Exception:
                            log.exception('Failed to process message "%s".', tweet)
        return 0
    except Exception as e:
        log.exception("Unknown error in main function-{0!s}.".format(e))
        return 1
def main():
    ap = args.get_parser()
    ap.add_argument('--replay', action="store_true",
                    help="Replay/test flag; if present, treat this run as a test case")
    # if the rule file is not given as an argument, it is loaded from sys.stdin
    ap.add_argument('--rulefile', type=str, help="The rule file for the duration analysis model")
    arg = ap.parse_args()

    if not arg.replay:
        assert arg.sub, 'Need a queue to subscribe to'
        assert arg.pub, 'Need a queue to publish to'
    logs.init(arg)
    queue.init(arg)
    test_flag = arg.replay

    if arg.rulefile:
        rule = eval(open(arg.rulefile).read())
    else:
        # load the rules from sys.stdin
        rule = eval(sys.stdin.read())

    conn = boto.connect_sdb()
    if not arg.replay:
        with queue.open(arg.sub, 'r') as inq:
            for m in inq:
                try:
                    replayIO = StringIO.StringIO()
                    durationProcess(rule, conn, m, arg.pub, test_flag, replayIO)
                except KeyboardInterrupt:
                    log.info('GOT SIGINT, exiting!')
                    break
                except EmbersException as e:
                    log.exception(e.value)
                except:
                    log.exception("Unexpected exception in process")
    else:
        # replay mode takes an enriched file as input
        enrich_messages = sys.stdin.readlines()
        for m in enrich_messages:
            m = json.loads(m.strip())
            try:
                replayIO = StringIO.StringIO()
                durationProcess(rule, conn, m, arg.pub, test_flag, replayIO)
            except KeyboardInterrupt:
                log.info('GOT SIGINT, exiting!')
                break
            except EmbersException as e:
                log.exception(e.value)
            except:
                log.exception("Unexpected exception in process")
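# A hypothetical replay invocation of the duration model above (the script and file names
# are assumptions, not from the source): the enriched messages are piped in as JSON lines
# and the rule file is passed explicitly, so nothing is read from the live queues.
#
#   cat enriched_messages.jsonl | python duration_analysis.py --replay \
#       --rulefile duration_rules.conf --pub test-warnings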
def main():
    ap = args.get_parser()
    ap.add_argument('--o', type=str, help="the output dir to store news")
    arg = ap.parse_args()
    assert arg.o, 'Need a dir to store news'
    logs.init(arg)

    locale.setlocale(locale.LC_TIME, 'es_ES.utf-8')
    seen_it = shelve.open('elfinance_seen_it.db')
    cas = ['finanzas']
    for ca in cas:
        get_category_news(ca, seen_it, arg.o)
def main():
    ap = args.get_parser()
    ap.add_argument('--f', type=str, help='the news file')
    arg = ap.parse_args()
    assert arg.f, 'Need a file to ingest'
    assert arg.pub, 'Need a queue to publish'
    logs.init(arg)
    queue.init(arg)

    with queue.open(arg.pub, 'w') as q_w, open(arg.f, 'r') as f_r:
        for line in f_r:
            news = json.loads(line)
            q_w.write(news)
def main():
    """
    Utility for querying warnings stored in Elasticsearch
    --log_file  : Path to write the log file to
    --log_level : Logging level
    """
    from etool import args
    global log

    arg_parser = args.get_parser()
    arg = arg_parser.parse_args()
    log = logs.getLogger(log_name=arg.log_file)
    logs.init(arg, l=arg.log_level, logfile=arg.log_file)

    print(query(max_results=30))
def initiate():
    global newsAlreadyDownload
    global companyListDir
    global port
    global newsAlreadDownloadFilePath
    global companyList
    global dailyNewsOutPath

    args = parse_args()
    logs.init()
    newsAlreadDownloadFilePath = args.f_downloaded
    companyListDir = args.f_company_list
    port = args.port
    dailyNewsOutPath = args.f_out
    newsAlreadyDownload = json.load(open(newsAlreadDownloadFilePath))
    companyList = json.load(open(companyListDir))
def main():
    ap = args.get_parser()
    ap.add_argument('--out', help="the output file of warnings")
    arg = ap.parse_args()
    assert arg.sub, 'Need a queue to subscribe!'
    assert arg.out, 'Need a file to store warnings!'
    logs.init(arg)
    queue.init(arg)

    out_file = arg.out
    with queue.open(arg.sub, 'r') as q_r:
        for m in q_r:
            with open(out_file, "a") as out_w:
                if not check_ifexist(m):
                    out_w.write(json.dumps(m) + "\n")
                else:
                    print "Duplicated Warnings"
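# check_ifexist() is defined elsewhere in the source; a minimal sketch of the same idea,
# deduplicating on a hash of the whole JSON message (the hashing scheme is an assumption,
# not the source's actual check):
import hashlib
import json

_seen_keys = set()


def is_duplicate(msg):
    # canonical JSON form so key order does not affect the hash
    key = hashlib.md5(json.dumps(msg, sort_keys=True).encode("utf-8")).hexdigest()
    if key in _seen_keys:
        return True
    _seen_keys.add(key)
    return False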
def main():
    ap = args.get_parser()
    ap.add_argument('-i', '--input', default='sys.stdin', type=str,
                    help='Path to the input file. Default is sys.stdin')
    ap.add_argument('-o', '--out', default='sys.stdout', type=str,
                    help='Path to the output file. Default is sys.stdout')
    ap.add_argument('searchPhrase', default='config/phrases.txt', type=str,
                    help='Path to the phrase file if the "-f" flag is specified; otherwise the '
                         'input string itself is treated as the phrase.')
    ap.add_argument('-f', '--file', action='store_true', default=False,
                    help='If given, the searchPhrase argument is interpreted as a path to a file')

    global logger
    logger = logs.getLogger("%s-%s.log" % (__processor__, str(datetime.now())))
    arg = ap.parse_args()
    logs.init(arg)

    inputFile = None
    outFile = None
    phraseFile = None

    if arg.input == 'sys.stdin':
        reader = codecs.getreader('utf-8')(sys.stdin)
    else:
        inputFile = open(arg.input, "r")
        reader = codecs.getreader('utf-8')(inputFile)

    if arg.out == 'sys.stdout':
        writer = codecs.getwriter('utf-8')(sys.stdout)
    else:
        outFile = codecs.open(arg.out, "w", encoding="utf-8")
        writer = codecs.getwriter('utf-8')(outFile)

    if arg.file:
        phraseFile = codecs.open(arg.searchPhrase, encoding='utf-8')
        generatePhraseList(phraseFile.readlines())
    else:
        generatePhraseList([arg.searchPhrase])

    phraseSearch(reader, writer)

    # close all files
    if inputFile:
        inputFile.close()
    if outFile:
        outFile.close()
    if phraseFile:
        phraseFile.close()
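# Hypothetical invocations of the phrase-search filter above (the script and file names
# are assumptions, not from the source):
#
#   # single literal phrase, streaming stdin -> stdout
#   cat tweets.jsonl | python phrase_search.py "economic crisis" > hits.jsonl
#
#   # phrase list read from a file, explicit input and output paths
#   python phrase_search.py -f config/phrases.txt -i tweets.jsonl -o hits.jsonl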
def main():
    """
    Utility to set up a mapping for an EMBERS queue in Elasticsearch
    -q | --queue : Queue name to set up the mapping for. Settings are read from embers.conf
    --log_file   : Path to write the log file to
    --log_level  : Logging level
    """
    from etool import args
    global log

    arg_parser = args.get_parser()
    arg_parser.add_argument('-q', '--queue', help='Queue name to map into Elasticsearch')
    arg = arg_parser.parse_args()
    assert arg.queue, '--queue must be provided'

    log = logs.getLogger(log_name=arg.log_file)
    logs.init(arg, l=arg.log_level, logfile=arg.log_file)

    add_type(index_name=general.get_index_name(), type_name=arg.queue)
def main():
    ap = args.get_parser()
    default_day = datetime.strftime(datetime.now(), "%Y-%m-%d")
    ap.add_argument("--d", type=str, default=default_day,
                    help="The day to ingest, Format: yyyy-mm-dd")
    ap.add_argument("--domain", default="bloomberg_prices",
                    help="The SimpleDB domain to store raw data")
    arg = ap.parse_args()
    assert arg.pub, "Need a queue to publish"
    logs.init(arg)
    queue.init(arg)

    with queue.open(arg.pub, "w") as out_q:
        for stock in STOCK_CON:
            if stock == "COLCAP":
                scrape_f = scrape_colcap_url
            if stock == "CHILE65":
                scrape_f = scrape_chile65_url
            msg = ingest_price(arg, stock, scrape_f)
            if msg is not None:
                out_q.write(msg)
                store(arg, msg)
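# Design note on the scraper selection above: the if-chain silently reuses the previously
# selected scraper when a symbol matches neither branch. A table-driven variant makes the
# mapping explicit; this is an illustrative sketch with stub functions standing in for the
# real scrape_colcap_url / scrape_chile65_url.
SCRAPER_STUBS = {
    "COLCAP": lambda: {"index": "COLCAP"},
    "CHILE65": lambda: {"index": "CHILE65"},
}


def scrape_or_skip(symbol):
    scrape_f = SCRAPER_STUBS.get(symbol)
    if scrape_f is None:
        return None  # unknown symbol: skip instead of falling through to the last scraper
    return scrape_f()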
def main():
    ap = args.get_parser()
    ap.add_argument('-c', '--conf', metavar='CONF', type=str, nargs='?',
                    default=os.path.join(os.path.dirname(__file__), 'bloomberg_news_ingest.conf'),
                    help='The location of the configuration file.')
    arg = ap.parse_args()
    assert arg.pub, "--pub required. Need a queue to publish on"
    logs.init(arg)
    conf = get_conf(arg.conf)
    seen_it = shelve.open("bloomberg_news_seen_it.db")

    try:
        with queue.open(arg.pub, 'w', capture=True) as outq:
            for (index, companies) in conf.items():
                for company in companies:
                    articles = get_stock_news(index, company, seen_it)
                    for a in articles:
                        outq.write(a)
    except KeyboardInterrupt:
        log.info('GOT SIGINT, exiting')
def main():
    ap = args.get_parser()
    ap.add_argument('--level', type=str, default="0.6", help='The threshold')
    ap.add_argument('--svm', action='store_true')
    ap.add_argument('--zmq', action='store_true')
    ap.add_argument('--surr', type=str, help="surrogate file")
    ap.add_argument('--warn', type=str, help="warning file")
    arg = ap.parse_args()
    logs.init(arg)
    queue.init(arg)
    assert arg.pub, "Please input a queue to publish warnings"
    if arg.zmq:
        assert arg.sub, "Please input a queue to sub surrogate messages"

    conn = boto.connect_sdb()
    t_domain = get_domain(conn, "s_holiday")

    if arg.zmq:
        with queue.open(arg.sub, 'r') as inq:
            for m in inq:
                try:
                    if arg.svm:
                        svm_warning(t_domain, m, arg.pub)
                    else:
                        warning_center(t_domain, m, arg.pub, float(arg.level))
                except KeyboardInterrupt:
                    log.info('GOT SIGINT, exiting!')
                    break
                except:
                    log.exception("Exception in Process:%s" % sys.exc_info()[0])
    else:
        with open(arg.warn, "w") as w, open(arg.surr) as r:
            if arg.svm:
                for m in r:
                    m = json.loads(m)
                    warning = svm_warning(t_domain, m, arg.pub)
                    w.write(json.dumps(warning) + "\n")
def main():
    ap = args.get_parser()
    ap.add_argument('--r_file', type=str, help="The rule file")
    ap.add_argument('--o', type=str, help="The output file")
    arg = ap.parse_args()
    assert arg.r_file, 'Need a rule file'
    assert arg.sub, 'Need a queue to subscribe'
    assert arg.o, 'Need a file to output'
    logs.init(arg)
    queue.init(arg)

    u_pattern = re.compile(r"http://(www\.){0,1}[^/]*/[a-z0-9/.\-]*(econ)[a-z0-9\.\-]*", flags=re.I)
    c_rule = create_label_rule(arg.r_file)
    g_rule = create_gold_lable(arg.r_file)
    c_pattern = re.compile(c_rule, flags=re.I)

    with queue.open(arg.sub, 'r') as q_r, codecs.open(arg.o, 'a') as f_a:
        for news in q_r:
            f_news = process(news, u_pattern, c_pattern, g_rule)
            if f_news is not None:
                f_a.write(json.dumps(f_news) + "\n")
                print f_news['date'], f_news['title'], "|", f_news['o_country'], "|", f_news["p_country"]
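# What the hard-coded URL pattern above accepts, illustrated on two made-up URLs
# (the URLs are for illustration only): a match requires an "econ" fragment somewhere
# in the path after the host.
import re

u_pattern = re.compile(r"http://(www\.){0,1}[^/]*/[a-z0-9/.\-]*(econ)[a-z0-9\.\-]*", flags=re.I)

print(bool(u_pattern.search("http://www.example.com/news/economia/articulo-1")))  # True: path contains "econ"
print(bool(u_pattern.search("http://example.com/deportes/futbol")))               # False: no "econ" in the path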
def main():
    logs.init(l=logs.DEBUG)
    print(get_data(sources=['Warnings']))
def __init__(self, cfgPath):
    common.init(cfgPath)
    logs.init()
def execute(arg):
    logs.init(arg)
    fromDate = datetime.strptime(arg.fromDate, "%d %b %Y")
    toDate = datetime.strptime(arg.toDate, "%d %b %Y")
    tweetFolder = arg.tweetFolder
    country = arg.country

    hashTagCounts = {}
    uids = {}

    # loading twitter handles from a file
    with open(arg.seedFile, 'r') as _file:
        for line in _file:
            handle, candidate = line.strip().split(',')
            if candidate not in uids:
                uids[candidate] = []
                hashTagCounts[candidate] = {}
            uids[candidate].append(handle.lower())

    # for geolocation
    geo = Geo()

    for _file in sorted(os.listdir(tweetFolder)):
        fileDate = datetime.strptime(_file[17:27], '%Y-%m-%d')
        if fileDate >= fromDate and fileDate < toDate:
            log.info("processing file %s" % (_file))
            try:
                with open(tweetFolder + "/" + _file, "r") as FILE:
                    for line in FILE:
                        try:
                            jsonTweet = json.loads(line.strip())
                            dateStr = jsonTweet['interaction']['created_at'][5:16]
                            tweetDate = datetime.strptime(dateStr, '%d %b %Y')
                            geoList = geo.geo_normalize(jsonTweet)
                            city, ctry, state = geoList[:3]
                            if ctry and (ctry.lower() == country) and (tweetDate >= fromDate) and (tweetDate <= toDate):
                                userId, realName = None, None
                                if 'twitter' in jsonTweet:
                                    if 'user' in jsonTweet['twitter']:
                                        if 'screen_name' in jsonTweet['twitter']['user']:
                                            userId = jsonTweet['twitter']['user']['screen_name'].lower()
                                        if 'name' in jsonTweet['twitter']['user']:
                                            realName = jsonTweet['twitter']['user']['name'].lower()
                                if userId is None and realName is None:
                                    continue
                                log.debug('userId or realName is not None')
                                candidate = getCandidate(userId, realName, uids)
                                if candidate is not None:
                                    log.debug('found candidate--> ' + candidate)
                                    # preprocess the tweet text
                                    text = jsonTweet["interaction"]["content"]
                                    text = re.sub(URL_REGEX, ' ', text)  # remove urls
                                    text = re.sub('[^A-Za-z_@#0-9]', ' ', normalize_str(text, lower=True))  # allow only alphanumerics and twitter tags
                                    text = re.sub(' +', ' ', text)  # remove multiple spaces
                                    hashTags = extract_hash_tags(text)
                                    hashTags = [hashTag for hashTag in hashTags if len(hashTag) > 3]
                                    for hashTag in hashTags:
                                        if hashTag.startswith('#'):
                                            hashTag = hashTag[1:]
                                        if hashTag in hashTagCounts[candidate]:
                                            hashTagCounts[candidate][hashTag] += 1
                                        else:
                                            hashTagCounts[candidate][hashTag] = 1
                        except Exception, e:
                            log.exception('error processing tweet %s' % e)
            except Exception, f:
                log.exception('error processing file %s' % f)
        else:
            log.debug('skipping file %s ' % _file)
def main():
    """
    Utility to cache messages from a queue into Elasticsearch
    -q | --queue : Read from <queue> and write the messages to Elasticsearch.
                   Settings are read from embers.conf
    --log_file   : Path to write the log file to
    --log_level  : Logging level
    """
    from etool import args
    global log

    arg_parser = args.get_parser()
    arg_parser.add_argument('-q', '--queue', help='Queue name to index into Elasticsearch')
    arg_parser.add_argument('-s', '--s3fromq', action='store_true',
                            help='ingest from S3 prefix derived from queue name')
    arg_parser.add_argument('-p', '--prefix', help='Ingest from prefix')
    #arg_parser.add_argument('-t', '--typename', default='noqueue', help='Type for prefix ingest')
    arg_parser.add_argument('-t', '--typename', help='Type for prefix ingest')
    arg_parser.add_argument('-l', '--tmpcopy', default='/home/embers/data/tmpcopy',
                            help='Name of local copy of S3 file (same for all S3 files)')
    arg_parser.add_argument('-c', '--chunk', type=int, default=100, help='Chunk size for S3 ingest')
    arg_parser.add_argument('-i', '--clustername', help='Clustername to determine index name')
    arg_parser.add_argument('-w', '--withbase', action="store_true",
                            help="Add basename to prefix when looking for type.")
    arg_parser.add_argument('--startdate', help='start date in format like 2015-01-02')
    arg_parser.add_argument('--enddate', help='end date in format like 2015-01-02')
    arg = arg_parser.parse_args()

    #assert (arg.queue or (arg.prefix and arg.typename)), 'Either --queue (with optional --s3fromq/--typename) or --prefix with --typename must be provided'
    assert (arg.queue or arg.prefix), 'Either --queue (with optional --s3fromq/--typename) or --prefix must be provided'

    log = logs.getLogger(log_name=arg.log_file)
    logs.init(arg, l=arg.log_level, logfile=arg.log_file)
    index_name = general.get_index_name(arg.clustername)

    queue.init()

    if arg.prefix or (arg.queue and arg.s3fromq):
        if arg.prefix:
            prefix = arg.prefix
            # get queue name or its substitute for S3 objects from prefix
            if arg.typename:
                type_name = arg.typename
            else:
                type_name = queue.conf.get_prefixpair(prefix=prefix, includeS3=True, withBasename=arg.withbase)
                if not type_name:
                    log.error("Could not get type from prefix %s" % prefix)
                    return 1
                log.warning("type_name=%s from prefix=%s" % (type_name, prefix))
        else:
            type_name = arg.queue
            prefix, include = queue.conf.get_prefix_for_queue(type_name, withBasename=False)
            if not prefix:
                log.error("Could not get S3 prefix for queue %s" % type_name)
                return 1

        if not general.get_es_connection().indices.exists_type(index=index_name, doc_type=type_name):
            # Create mapping if the queue has not been stored in Elasticsearch yet
            index_setup.add_type(index_name=index_name, type_name=type_name)

        conn_s3 = boto.connect_s3(aws_access_key_id=arg.aws_key, aws_secret_access_key=arg.aws_secret)
        # connect to S3, get bucket ptr for arg.bucket
        bucket = conn_s3.get_bucket(arg.bucket)
        attach_to_s3(index_name, s3prefix=prefix, bucket=bucket, type_name=type_name,
                     tmpcopy=arg.tmpcopy, chunk_size=arg.chunk,
                     startdate=arg.startdate, enddate=arg.enddate)
    else:
        if arg.typename:
            type_name = arg.typename
        else:
            type_name = arg.queue

        if not general.get_es_connection().indices.exists_type(index=index_name, doc_type=type_name):
            # Create mapping if the queue has not been stored in Elasticsearch yet
            index_setup.add_type(index_name=index_name, type_name=type_name)

        attach_to_queue(index_name=index_name, queue_name=arg.queue, type_name=type_name)
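# Hypothetical invocations of the Elasticsearch ingest utility above (the script name and
# the queue/prefix values are assumptions, not from the source):
#
#   # live mode: attach to a queue and index every message as it arrives
#   python es_ingest.py -q warnings --log_file es_ingest.log --log_level INFO
#
#   # backfill mode: pull the same queue's history out of S3 instead
#   python es_ingest.py -q warnings -s --startdate 2015-01-02 --enddate 2015-01-31
#
#   # explicit S3 prefix with an explicit type name
#   python es_ingest.py -p incoming/warnings/ -t warnings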
def execute(arg):
    logs.init(arg)
    log.info("*************************************")
    log.info("PSL 4 Elections pipeline initializing")
    log.info("tweet folder------> " + arg.tweetFolder)
    log.info("dataFolder--------> " + str(arg.dataFolder))
    log.info("fromDate----------> " + arg.fromDate)
    log.info("toDate------------> " + arg.toDate)
    log.info("window------------> " + str(arg.window) + " day(s)")
    log.info("userThreshold-----> " + str(arg.userThreshold))
    log.info("wordThreshold-----> " + str(arg.wordThreshold))
    log.info("makePredictions---> " + arg.predictionFlag)
    log.info("*************************************")

    fromDate = datetime.strptime(arg.fromDate, "%d %b %Y")
    toDate = datetime.strptime(arg.toDate, "%d %b %Y")
    currentDate = fromDate
    iterCount = 1
    membership = {}
    likes = {}
    hates = {}

    if arg.predictionFlag == '1':
        makePredictions(arg.dataFolder + '/' + arg.country, fromDate, toDate,
                        arg.userThreshold, arg.configFile, arg.regressionFlag)
        sys.exit()

    while currentDate <= toDate:
        log.info("iterCount--------------->" + str(iterCount))
        log.info("processing PSL pipeline for %s" % (currentDate.strftime("%d %b %Y")))

        log.debug("creating the directory substructure for current date")
        inputFolder = arg.dataFolder + '/' + arg.country + '/' + currentDate.strftime("%d%b") + '/inputs'
        outputFolder = arg.dataFolder + '/' + arg.country + '/' + currentDate.strftime("%d%b") + '/outputs'
        statsFolder = arg.dataFolder + '/' + arg.country + '/' + currentDate.strftime("%d%b") + '/stats'
        os.system('mkdir -p ' + inputFolder)
        os.system('mkdir -p ' + outputFolder)
        os.system('mkdir -p ' + statsFolder)

        nextDate = currentDate + timedelta(days=arg.window)
        if nextDate <= toDate:
            log.debug("creating the directory substructure for next date")
            nextInputFolder = arg.dataFolder + '/' + arg.country + '/' + nextDate.strftime("%d%b") + '/inputs'
            nextOutputFolder = arg.dataFolder + '/' + arg.country + '/' + nextDate.strftime("%d%b") + '/outputs'
            nextStatsFolder = arg.dataFolder + '/' + arg.country + '/' + nextDate.strftime("%d%b") + '/stats'
            os.system('mkdir -p ' + nextInputFolder)
            os.system('mkdir -p ' + nextOutputFolder)
            os.system('mkdir -p ' + nextStatsFolder)

        if iterCount == 1:
            keywordList = []
            # adding seed words to the list
            seedWordList = []
            with open(arg.seedFile, 'r') as file:
                for line in file:
                    word, group, weight = line.split(',')
                    seedWordList.append(word)
            keywordList = list(set(keywordList).union(set(seedWordList)))
            log.info("copying the seedFile to inputFolder")
            os.system('cp ' + arg.seedFile + ' ' + inputFolder + '/seedWords.csv')

        preProcess(arg.tweetFolder, inputFolder, keywordList, currentDate, nextDate, arg.country)
        log.info("***********preProcess complete******************")

        executePSLCode(inputFolder, outputFolder, arg.classPathFile)
        log.info("***********PSL code complete********************")

        postProcess(outputFolder, nextInputFolder, statsFolder, arg.userThreshold,
                    arg.wordThreshold, membership, likes, hates)
        log.info("**********postProcess complete*****************")

        log.info("deleting the database used for current iteration")
        os.system('rm /home/aravindan/Dropbox/git/ms_thesis/psl/electionLDADB*')

        iterCount += 1
        currentDate = nextDate

    log.info("**********************************************************")
import xlrd
import json
import datetime
import hashlib
import os
import argparse
import sys
from etool import logs
from Util import calculator
import sqlite3 as lite

"""
Initiate the Json data for all the Indices
"""

logs.init()
__processor__ = os.path.basename(__file__.split(".")[0])
log = logs.getLogger(__processor__)


def transcsv2json(xlsFile):
    wb = xlrd.open_workbook(xlsFile)
    sh = wb.sheet_by_name('Sheet1')
    # read the stock index from file
    stockIndex = sh.row_values(0, 0)[0].split(" ")[0]
    stockPrices = []
    for rownum in range(2, sh.nrows):
        try:
            time_tuple = xlrd.xldate_as_tuple(sh.row_values(rownum, 0)[0], 0)
            post_date = datetime.datetime.strftime(datetime.date(time_tuple[0], time_tuple[1], time_tuple[2]), "%Y-%m-%d")
            lastPrice = float(sh.row_values(rownum, 0)[1])
            previousCloseValue = float(sh.row_values(rownum, 0)[3])
def main():
    ap = args.get_parser()
    ap.add_argument('-t', '--tweetFolder', type=str, help='inputFolder pointing to PSLs output',
                    default='/hdd/tweets/2012/oct')
    ap.add_argument('-df', '--dataFolder', type=str,
                    help='folder to store intermediate outputs and final outputs',
                    default='/home/aravindan/Dropbox/git/ms_thesis/data/psl')
    ap.add_argument('-ut', '--userThreshold', type=float, help='probability threshold of user membership',
                    default=0.60)
    ap.add_argument('-wt', '--wordThreshold', type=float, help='probability threshold for vocab',
                    default=0.70)
    ap.add_argument('-s', '--seedFile', type=str, help='seed file containing the initial seed vocabulary',
                    default='/home/aravindan/Dropbox/git/ms_thesis/psl/seedWords/venezuela.csv')
    ap.add_argument('-c', '--country', type=str, help='country to model elections for', default='venezuela')
    ap.add_argument('-w', '--window', type=int, help='number of days of tweets used to infer', default=1)
    ap.add_argument('-d1', '--fromDate', type=str, help='date from which to track tweets', default='01 Oct 2012')
    ap.add_argument('-d2', '--toDate', type=str, help='date to which to track tweets', default='06 Oct 2012')
    ap.add_argument('-cp', '--classPathFile', type=str, help='file containing class path for PSL execution',
                    default='/home/aravindan/Dropbox/git/ms_thesis/psl/classPathFile.txt')
    arg = ap.parse_args()
    logs.init(arg)

    log.info("*************************************")
    log.info("PSL 4 Elections pipeline initializing")
    log.info("tweet folder------> " + arg.tweetFolder)
    log.info("dataFolder--------> " + str(arg.dataFolder))
    log.info("fromDate----------> " + arg.fromDate)
    log.info("toDate------------> " + arg.toDate)
    log.info("window------------> " + str(arg.window) + " day(s)")
    log.info("userThreshold-----> " + str(arg.userThreshold))
    log.info("wordThreshold-----> " + str(arg.wordThreshold))
    log.info("*************************************")

    fromDate = datetime.strptime(arg.fromDate, "%d %b %Y")
    toDate = datetime.strptime(arg.toDate, "%d %b %Y")
    currentDate = fromDate
    iterCount = 1
    membership = {}
    vocab = {}
    filesProcessed = []

    while currentDate <= toDate:
        log.info("iterCount--------------->" + str(iterCount))
        log.info("processing PSL pipeline for %s" % (currentDate.strftime("%d %b %Y")))

        log.info("creating the directory substructure for current iteration")
        inputFolder = arg.dataFolder + '/' + arg.country + '/iteration' + str(iterCount) + '/inputs'
        outputFolder = arg.dataFolder + '/' + arg.country + '/iteration' + str(iterCount) + '/outputs'
        statsFolder = arg.dataFolder + '/' + arg.country + '/iteration' + str(iterCount) + '/stats'
        os.system('mkdir -p ' + inputFolder)
        os.system('mkdir -p ' + outputFolder)
        os.system('mkdir -p ' + statsFolder)

        log.info("creating the directory substructure for next iteration")
        nextInputFolder = arg.dataFolder + '/' + arg.country + '/iteration' + str(iterCount + 1) + '/inputs'
        nextOutputFolder = arg.dataFolder + '/' + arg.country + '/iteration' + str(iterCount + 1) + '/outputs'
        nextStatsFolder = arg.dataFolder + '/' + arg.country + '/iteration' + str(iterCount + 1) + '/stats'
        os.system('mkdir -p ' + nextInputFolder)
        os.system('mkdir -p ' + nextOutputFolder)
        os.system('mkdir -p ' + nextStatsFolder)

        if iterCount == 1:
            keywordList = []
            # adding seed words to the list
            seedWordList = []
            with open(arg.seedFile, 'r') as file:
                for line in file:
                    word, group, weight = line.split(',')
                    seedWordList.append(word)
            keywordList = list(set(keywordList).union(set(seedWordList)))
            log.info("copying the seedFile to inputFolder")
            os.system('cp ' + arg.seedFile + ' ' + inputFolder + '/seedWords.csv')

        fileDate, filesProcessed = preProcess(arg.tweetFolder, inputFolder, keywordList, currentDate,
                                              toDate, arg.country, list(set(filesProcessed)))
        log.info("***********preProcess complete******************")

        executePSLCode(inputFolder, outputFolder, arg.classPathFile)
        log.info("***********PSL code complete********************")

        keywordList = postProcess(outputFolder, nextInputFolder, statsFolder, arg.userThreshold,
                                  arg.wordThreshold, membership, vocab, seedWordList, currentDate)
        log.info("**********postProcess complete*****************")

        if fileDate > currentDate:
            currentDate = fileDate

        log.info("deleting the database used for current iteration")
        os.system('rm /home/aravindan/Dropbox/git/ms_thesis/psl/electionPSLDB*')

        iterCount += 1

    log.info("**********************************************************")
def main():
    ap = args.get_parser()
    ap.add_argument("-t", "--tweetFolder", type=str, help="inputFolder pointing to PSLs output",
                    default="/hdd/tweets/2012/oct")
    ap.add_argument("-df", "--dataFolder", type=str,
                    help="folder to store intermediate outputs and final outputs",
                    default="/home/aravindan/Dropbox/git/ms_thesis/data/dqe")
    ap.add_argument("-wt", "--wordThreshold", type=float,
                    help="n-percent of words to propagate to next iteration", default=25)
    ap.add_argument("-s", "--seedFile", type=str, help="seed file containing the initial seed vocabulary",
                    default="/home/aravindan/Dropbox/git/ms_thesis/psl/seedWords/venezuela.csv")
    ap.add_argument("-d1", "--fromDate", type=str, help="date from which to track tweets", default="01 Oct 2012")
    ap.add_argument("-d2", "--toDate", type=str, help="date to which to track tweets", default="06 Oct 2012")
    ap.add_argument("-w", "--window", type=int, help="number of days of tweets used to infer", default=1)
    ap.add_argument("-c", "--country", type=str, help="country to execute the pipeline for", default="venezuela")
    arg = ap.parse_args()
    logs.init(arg)

    log.info("*************************************")
    log.info("PSL 4 Elections pipeline initializing")
    log.info("tweet folder------> " + arg.tweetFolder)
    log.info("dataFolder--------> " + str(arg.dataFolder))
    log.info("fromDate----------> " + arg.fromDate)
    log.info("toDate------------> " + arg.toDate)
    log.info("window------------> " + str(arg.window) + " day(s)")
    log.info("country-----------> " + arg.country)
    log.info("wordThreshold-----> " + str(arg.wordThreshold) + "%")
    log.info("*************************************")

    fromDate = datetime.strptime(arg.fromDate, "%d %b %Y")
    toDate = datetime.strptime(arg.toDate, "%d %b %Y")
    currentDate = fromDate
    iterCount = 1
    vocab = {}

    while currentDate <= toDate:
        log.info("iterCount--------------->" + str(iterCount))
        log.info("processing PSL pipeline for %s" % (currentDate.strftime("%d %b %Y")))

        log.info("creating the directory substructure for current date")
        outputFolder = arg.dataFolder + "/" + arg.country + "/" + currentDate.strftime("%d%b")
        os.system("mkdir -p " + outputFolder)

        nextDate = currentDate + timedelta(days=arg.window)

        if iterCount == 1:
            with open(arg.seedFile, "r") as file:
                for line in file:
                    word, group, weight = line.split(",")
                    if group in vocab:
                        vocab[group][word] = weight
                    else:
                        vocab[group] = {}
                        vocab[group][word] = weight

        counts = trackTweets(arg.tweetFolder, vocab, currentDate, nextDate, arg.country, arg.wordThreshold)
        log.info("***********trackTweets complete***************")

        # normalize the counts for each group (use float division so weights are not truncated to 0/1)
        for group in counts:
            maxCount = 0
            for word in counts[group]:
                if counts[group][word] > maxCount:
                    maxCount = counts[group][word]
            for word in counts[group]:
                weight = float(counts[group][word]) / maxCount
                vocab[group][word] = weight

        # dumping the vocab learnt
        for group in vocab:
            with open(outputFolder + "/" + group + "_vocab.csv", "w") as f:
                sorted_tuples = sorted(vocab[group].iteritems(), key=operator.itemgetter(1), reverse=True)
                for (word, weight) in sorted_tuples:
                    f.write(word + "," + str(weight))
                    f.write("\n")

        currentDate = nextDate
        iterCount += 1
        log.info("*********************************************")

    log.info("************ALL iterations complete********************")
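# The max-normalization step above, isolated as a small helper for clarity. This is a
# sketch, not code from the source, and it assumes counts are plain ints per word.
def normalize_counts(counts):
    # Scale every word's count by its group's maximum so weights fall in (0, 1].
    normalized = {}
    for group, words in counts.items():
        max_count = max(words.values()) if words else 0
        normalized[group] = {
            word: (float(c) / max_count if max_count else 0.0) for word, c in words.items()
        }
    return normalized


# Example:
#   normalize_counts({"likes": {"hope": 4, "change": 2}})
#   -> {"likes": {"hope": 1.0, "change": 0.5}}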
def main():
    global CONFIG, VOCABULARY_FILE, WARNING_PORT, SURROGATE_PORT, __version__, KEY_ID, SECRET, T_EASTERN, T_UTC

    # Initiate the TimeZone setting
    T_UTC = pytz.utc
    T_EASTERN = pytz.timezone("US/Eastern")

    # Get the input args
    arg = parse_args()
    rege_date = arg.rege_date
    KEY_ID = arg.aws_key
    SECRET = arg.aws_secret

    logs.init(arg)
    queue.init(arg)
    conn = boto.connect_sdb(KEY_ID, SECRET)

    # initiate SimpleDB domains
    surrogateDomain = get_domain(conn, arg.surrogate_domain)
    warningDomain = get_domain(conn, arg.warning_domain)
    WARNING_PORT = arg.warning_port
    SURROGATE_PORT = arg.surrogate_port

    # if rege_date is given, regenerate that past day's prediction; otherwise predict normally
    if not rege_date:
        # Normal prediction
        predict_date = arg.predict_date
        model_cfg = arg.model_cfg
        stock_list = None
        if arg.stock_list:
            stock_list = arg.stock_list

        # Get the latest version of the config object
        f = open(model_cfg, "r")
        configObj = json.load(f)
        f.close()
        con_versionNum = max([int(v) for v in configObj.keys()])
        CONFIG = configObj[str(con_versionNum)]

        # Get the latest version of the trend range object
        clusterTrends = json.load(sys.stdin)
        trend_versionNum = max([int(v) for v in clusterTrends.keys()])
        CONFIG["trendRange"] = {"version": str(trend_versionNum),
                                "range": clusterTrends[str(trend_versionNum)]}

        if not stock_list:
            stock_list = CONFIG["stocks"]

        # Retrain the model configuration if the current day is Saturday
        #weekDay = datetime.strptime(predict_date, "%Y-%m-%d").weekday()
        #if weekDay == 5:
        #    finalClusterProbability, finalClusterMatrix = re_training(surrogateDomain, predict_date, stock_list)
        #    new_config = json.loads(json.dumps(CONFIG))
        #    new_config["clusterProbability"] = finalClusterProbability
        #    new_config["clusterContribution"] = finalClusterMatrix
        #    # Write back to the configuration file
        #    new_version_num = con_versionNum + 1
        #    new_config["version"] = new_version_num
        #    configObj[str(new_version_num)] = new_config
        #    with open(model_cfg, "w") as out_q:
        #        out_q.write(json.dumps(configObj))

        # Process the stocks one by one
        for stock in stock_list:
            surrogate = process_single_stock(surrogateDomain, predict_date, stock)
            if surrogate:
                warning = warning_check(warningDomain, surrogate)
    else:
        # regenerate the old prediction
        model_cfg = arg.model_cfg
        stock_list = None
        if arg.stock_list:
            stock_list = arg.stock_list

        # Get the version of the config object used for the indicated prediction
        versionObj = get_predicion_version(warningDomain, rege_date)
        configVersionNum = versionObj["configVersion"]
        trendVersionNum = versionObj["trendVersion"]
        configObj = json.load(open(model_cfg))
        if configVersionNum in configObj:
            CONFIG = configObj[configVersionNum]
        else:
            CONFIG = configObj["1"]

        # Get the latest version of the trend range object
        clusterTrends = json.load(sys.stdin)
        # get the latest available version at or below the warning's trend version
        tmpVersion = int(trendVersionNum)
        while tmpVersion >= 1:
            if str(tmpVersion) in clusterTrends:
                trendVersionNum = str(tmpVersion)
                break
            else:
                tmpVersion -= 1
        CONFIG["trendRange"] = {"version": str(trendVersionNum),
                                "range": clusterTrends[trendVersionNum]}

        if not stock_list:
            stock_list = CONFIG["stocks"]

        # Process the stocks one by one
        for stock in stock_list:
            replayIO = StringIO.StringIO()
            surrogate = process_single_stock(surrogateDomain, rege_date, stock, True, replayIO)
            if surrogate:
                warning = warning_check(warningDomain, surrogate, True, replayIO)
            replayInfo = replayIO.getvalue()
            weid = getwarningeid(surrogateDomain, rege_date, stock)
            with open("./demo/%s.txt" % weid, "w") as win:
                win.write(replayInfo)

    if conn:
        conn.close()
def __init__(self, wg_data=WG_DATA, co_admin_data=CO_ADMIN_DATA,
             priority_policy=PRIORITY_POLICY, debug=False):
    """ """
    self.priority_policy = priority_policy
    self.debug = debug
    self.__version__ = "{0}-{1}-{2}-{3}-{4}".format(
        self.__class__.__name__, __version__,
        hashlib.md5(get_wg_data(wg_data).read()).hexdigest(),
        hashlib.md5(get_co_admin_data(co_admin_data).read()).hexdigest(),
        hashlib.md5(" ".join(self.priority_policy)).hexdigest())

    if self.debug:
        try:
            logs.init()
        except IOError:  # , err:
            logs.init(logfile=self.__class__.__name__.lower())
        self.log = logs.getLogger("{0}-{1}".format(
            self.__class__.__name__, __version__.replace('.', '_')))

    # 1. load country and admin1 level geo data
    f = get_co_admin_data(co_admin_data)
    dialect = csv.Sniffer().sniff(f.read(10240), delimiters="\t")
    f.seek(0)
    reader = csv.DictReader(f, dialect=dialect, fieldnames=CO_ADMIN_FIELDS)

    # NOTE:
    # Known conflicts b/w codes of countries and other admins
    # co Colombia ('Colombia', 'C\xc3\xb3rdoba')
    # cl Chile ('Colombia', 'Caldas')
    # ar Argentina ('Colombia', 'Arauca')
    # sv El Salvador ('El Salvador', 'San Vicente')

    # prep lookup dictionaries (key__value)
    # countries
    self.co_code = {}
    self.co_names = {}
    self.co_aliases = {}
    self.co_capital_cities = {}
    # admin1
    self.admin_code = {}
    self.admin_name = {}

    # assumes countries appear first when reading data from lac_co_admin TODO BAD!
    for r in reader:
        for k in r.keys():
            r[k] = r[k].strip()
        lat = float_or_none(r['latitude'])
        lon = float_or_none(r['longitude'])
        code = object_or_none(r['iso_3166_code'])
        rid = int_or_none(r["id"])
        if r['type'] == 'country':
            # country
            if code:
                self.co_code[code] = r['name']
                self.co_names[nstr(r['name'])] = (rid, lat, lon, code, r['name'])
                self.co_capital_cities[nstr(r['capital_city'])] = (r['capital_city'], r['name'])
                aliases = r['alt_names'].split(',')
                self.co_aliases.update({nstr(alias.strip()): r['name'] for alias in aliases})
            else:
                if self.debug:
                    self.log.error("Bad data country {0} Code {1}".format(r['name'], code))
        elif r['type'] == 'admin':
            # admin
            admin, co = r['full_name'].split(',')
            admin, co = admin.strip(), co.strip()
            if code:
                if code not in self.admin_code:
                    self.admin_code[code] = []
                self.admin_code[code].append((co, admin))
            co1, a = nstr(co), nstr(admin)
            if a not in self.admin_name:
                self.admin_name[a] = {}
            if co1 not in self.admin_name[a]:
                self.admin_name[a][co1] = (rid, lat, lon, code, admin, co)
    f.close()

    # 2. load (world-gazeteer) city level geo data
    f = get_wg_data(wg_data)
    dialect = csv.Sniffer().sniff(f.read(10240), delimiters="\t")
    f.seek(0)
    reader = csv.DictReader(f, dialect=dialect, fieldnames=WG_FIELDS)
    self.ci_aliases = {}
    # main data store for geocoding
    self.data = []
    counter = 0
    ci_set = set()
    for r in reader:
        for k in r.keys():
            r[k] = r[k].strip()
        # get alias names for cities
        ci_names = [a.strip() for a in r['alt_names'].split(',') if len(a.strip()) > 0]
        ci_names.extend([a.strip() for a in r['orig_names'].split(',') if len(a.strip()) > 0])
        for ci in ci_names:
            k = (nstr(ci), nstr(r['country']))
            a1 = nstr(r['admin1'])
            if k not in self.ci_aliases:
                self.ci_aliases[k] = {a1: set([r['name']])}
            elif a1 not in self.ci_aliases[k]:
                self.ci_aliases[k][a1] = set([r['name']])
            else:
                # Cases where different cities for the same
                # admin-country pair have the same alias
                self.ci_aliases[k][a1].add(r['name'])
            # add ci name aliases into ci_set
            ci_set.add(nstr(ci))
        # store only canonical city names
        self.data.append((counter, (r['name'], r['country'], r['admin1'],
                                    object_or_none(r['admin2']),
                                    object_or_none(r['admin3']),
                                    int_or_none(r['pop']),
                                    float_or_none(r['latitude']) / 100,
                                    float_or_none(r['longitude']) / 100,
                                    int(r['id']), int(r['padded']))))
        counter += 1

    self.coordinates = {}
    # cases where admin1 and city share the same name:
    # extended feature/hack #1 to resolve the city when
    # only country and admin1 are specified
    self.same_ci_a1_name = {}
    for i, (n, c, a1, a2, a3, p, lat, lon, i_d, pad) in self.data:
        nn, nc, na1 = nstr(n), nstr(c), nstr(a1)
        self.coordinates[(lat, lon)] = i
        if nn == na1 and pad == 0:
            self.same_ci_a1_name[(nc, na1)] = n
        ci_set.add(nn)

    # store (lat, lon)
    self.kdtree = KDTree([[i, j] for i, j in self.coordinates.keys()
                          if i is not None and j is not None])

    # build regular expr dicts
    co_set = set(self.co_names.keys())
    # add country name aliases into co_set
    co_set.update(self.co_aliases.keys())
    self.co_reg = ManyRE(co_set)
    self.ci_reg = ManyRE(ci_set)

    # add admin1 name aliases into admin1_set
    admin1_set = set(self.admin_name.keys())
    # build regular expression stores for co-admin1-ci
    self.admin1_reg = ManyRE(admin1_set)

    # add stopwords to prevent any 2-letter word in common usage
    # from being misinterpreted as a country or admin code
    two_letter_stop_words = set(
        ['BE', 'WE', '\xc3\xa0', 'YO', 'DO', 'YA', 'DE', 'DA', 'HA', 'BY',
         'HE', 'AL', 'NI', 'LE', 'NO', 'LO', 'TU', 'TO', 'TI', 'TE', 'EM',
         'EL', 'EN', 'IS', 'OS', 'AM', 'IT', 'AO', 'AN', 'AS', 'AT', 'IN',
         'EU', 'ES', 'IF', 'ME', 'ON', 'OF', 'LA', 'MI', 'UP', 'SU', 'UM',
         'UN', 'SO', 'NA', 'OU', 'MY', 'OR', 'SE', 'US'])
    self.co_code_reg = ManyRE([sw for sw in self.co_code.keys()
                               if sw not in two_letter_stop_words])
    self.admin1_code_reg1 = ManyRE(self.admin_code.keys())
    self.admin1_code_reg2 = ManyRE([sw for sw in self.admin_code.keys()
                                    if sw not in two_letter_stop_words])

    self.bguess = {}
    for i, (city, country, admin1, a2, a3, p, la, lo, i_d, pad) in self.data:
        ci, co, a = nstr(city), nstr(country), nstr(admin1)
        # value is the list of admin1's that correspond to the ci-co key;
        # ci-co makes the dictionary flatter. We choose not to use
        # co-admin1-ci as the key to allow more flexible lookups.
        if ci in self.bguess:
            if co in self.bguess[ci]:
                if a in self.bguess[ci][co]:
                    # store original wg-records (marked with pad == 0)
                    # at the head of the queue
                    if pad == 0:
                        self.bguess[ci][co][a].appendleft(i)
                    else:
                        self.bguess[ci][co][a].append(i)
                else:
                    self.bguess[ci][co][a] = deque([i])
            else:
                self.bguess[ci][co] = {a: deque([i])}
        else:
            self.bguess[ci] = {co: {a: deque([i])}}
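# A sketch of how the kdtree/coordinates structures built above could be used to find the
# stored record closest to a given point. This is illustrative only: `g` is assumed to be an
# instance of the class, and the query() call follows scipy's KDTree API (an assumption
# about which KDTree implementation is imported).
def nearest_record(g, lat, lon):
    # KDTree.query returns (distance, index into the points the tree was built from)
    dist, idx = g.kdtree.query([lat, lon])
    point = tuple(g.kdtree.data[idx])      # the stored (lat, lon) pair
    record_id = g.coordinates[point]       # map the pair back to the row id in g.data
    return g.data[record_id][1], dist      # the (name, country, admin1, ...) tuple and distance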