def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--ticker_file', required=True)
    parser.add_argument('--input_dir', required=True)
    parser.add_argument('--output_dir', required=True)
    parser.add_argument('--overwrite', action='store_true')
    parser.add_argument('--verbose', action='store_true')
    args = parser.parse_args()

    utils.setup_logging(args.verbose)

    # Tickers are listed one per line.
    with open(args.ticker_file, 'r') as fp:
        tickers = fp.read().splitlines()
    logging.info('Processing %d tickers' % len(tickers))

    for i in range(len(tickers)):
        ticker = tickers[i]
        if ticker in SKIPPED_TICKERS:
            logging.warning('%d/%d: skipped %s' % (i+1, len(tickers), ticker))
            continue
        logging.info('%d/%d: %s' % (i+1, len(tickers), ticker))
        input_path = '%s/%s.csv' % (args.input_dir, ticker)
        output_path = '%s/%s.csv' % (args.output_dir, ticker)
        if not path.isfile(input_path):
            logging.warning('Input file does not exist: %s' % input_path)
            continue
        if path.isfile(output_path) and not args.overwrite:
            logging.warning('Output file exists and not overwritable: %s' % output_path)
            continue
        parse(input_path, output_path)
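# Note: many of the CLI scripts in this listing call utils.setup_logging(args.verbose)
# without showing the helper itself. A minimal sketch of what such a helper might
# look like, assuming it only sets the root logger's level and format (the real
# utils module is not shown here and may differ):
import logging

def setup_logging(verbose=False):
    # Emit DEBUG records only when --verbose is passed; INFO otherwise.
    logging.basicConfig(
        level=logging.DEBUG if verbose else logging.INFO,
        format='%(asctime)s %(levelname)s %(message)s')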
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--ticker_file', required=True)
    parser.add_argument('--input_dir', required=True)
    parser.add_argument('--output_dir', required=True)
    parser.add_argument('--overwrite', action='store_true')
    parser.add_argument('--verbose', action='store_true')
    args = parser.parse_args()

    # Sanity check.
    assert args.input_dir != args.output_dir

    utils.setup_logging(args.verbose)

    # Tickers are listed one per line.
    with open(args.ticker_file, 'r') as fp:
        tickers = fp.read().splitlines()
    logging.info('Processing %d tickers' % len(tickers))

    for i in range(len(tickers)):
        ticker = tickers[i]
        logging.info('%d/%d: %s' % (i+1, len(tickers), ticker))
        input_path = '%s/%s.csv' % (args.input_dir, ticker.replace('^', '_'))
        if not path.isfile(input_path):
            logging.warning('Input file is missing: %s' % input_path)
            continue
        output_path = '%s/%s.csv' % (args.output_dir, ticker.replace('^', '_'))
        if path.isfile(output_path) and not args.overwrite:
            logging.warning('Output file exists and not overwritable: %s' % output_path)
            continue
        sample(input_path, output_path)
def main():
    _mkdirs(SRCDIR, INSTALLDIR)
    setup_logging()
    fetch_and_build()
    for db in ('sqlite3', 'mysql'):
        shell('rm -rf {}/*'.format(INSTALLDIR))
        setup_and_test(db)
def __init__(self, name, port, pin, scale_factor, zero_point):
    logger = logging.getLogger('log')
    setup_logging(name)

    try:
        import RPi.GPIO as GPIO
    except ImportError:
        logger.critical('[Servo Socket]: GPIO not configured properly!')
        sys.exit(1)

    self.port = port
    self.pin = pin
    self.scale_factor = scale_factor
    self.zero_point = zero_point

    # Configure the servo
    GPIO.setmode(GPIO.BOARD)
    GPIO.setup(self.pin, GPIO.OUT)

    # Define the socket parameters
    HOST = ''
    PORT = self.port
    connection = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    connection.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)

    # Bind socket to local host and port
    try:
        connection.bind((HOST, PORT))
    except socket.error, msg:
        logger.critical('[Servo Socket]: Bind failed. Error Code: ' + str(msg[0]) +
                        ' Message ' + msg[1])
        sys.exit()
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--ticker_file', required=True)
    parser.add_argument('--input_dir', required=True)
    parser.add_argument('--from_ticker', default='')
    parser.add_argument('--verbose', action='store_true')
    args = parser.parse_args()

    utils.setup_logging(args.verbose)

    # Tickers are listed one per line.
    with open(args.ticker_file, 'r') as fp:
        lines = fp.read().splitlines()
    tickers = []
    for line in lines:
        if line >= args.from_ticker:
            tickers.append(line)
    logging.info('Processing %d tickers' % len(tickers))

    for i in range(len(tickers)):
        ticker = tickers[i]
        logging.info('%d/%d: %s' % (i+1, len(tickers), ticker))
        input_path = '%s/%s.csv' % (args.input_dir, ticker.replace('^', '_'))
        if not path.isfile(input_path):
            logging.warning('Input file does not exist: %s' % input_path)
            continue
        validate(input_path)
def start_tracker():
    """Start the Torrent Tracker."""
    # parse commandline options
    parser = OptionParser()
    parser.add_option('-p', '--port', help='Tracker Port', default=0)
    parser.add_option('-b', '--background', action='store_true',
                      default=False, help='Start in background')
    parser.add_option('-d', '--debug', action='store_true',
                      default=False, help='Debug mode')
    (options, args) = parser.parse_args()

    # setup directories
    utils.create_pytt_dirs()
    # setup logging
    utils.setup_logging(options.debug)

    try:
        # start the torrent tracker
        run_app(int(options.port) or
                utils.get_config().getint('tracker', 'port'))
    except KeyboardInterrupt:
        logging.info('Tracker Stopped.')
        utils.close_db()
        sys.exit(0)
    except Exception, ex:
        logging.fatal('%s' % str(ex))
        utils.close_db()
        sys.exit(-1)
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--total_assets_path', required=True)
    parser.add_argument('--intangible_assets_path', required=True)
    parser.add_argument('--total_liabilities_path', required=True)
    parser.add_argument('--prices_path', required=True)
    parser.add_argument('--outstanding_shares_path', required=True)
    parser.add_argument('--output_path', required=True)
    parser.add_argument('--verbose', action='store_true')
    args = parser.parse_args()

    utils.setup_logging(args.verbose)

    ta_map = utils.read_map(args.total_assets_path)
    tl_map = utils.read_map(args.total_liabilities_path)
    p_map = utils.read_map(args.prices_path)
    s_map = utils.read_map(args.outstanding_shares_path)
    tickers = ta_map.keys() & tl_map.keys() & p_map.keys() & s_map.keys()

    # intangible assets are 0 by default
    ia_map = dict()
    for t in tickers:
        ia_map[t] = 0.0
    ia_part = utils.read_map(args.intangible_assets_path)
    for k, v in ia_part.items():
        ia_map[k] = v

    with open(args.output_path, 'w') as fp:
        for ticker in sorted(tickers):
            output = ((ta_map[ticker] - ia_map[ticker] - tl_map[ticker])
                      / s_map[ticker] / p_map[ticker])
            print('%s %f' % (ticker, output), file=fp)
def run():
    """Main loop. Run this TA for ever"""
    try:
        meta_configs, stanza_configs = conf.parse_modinput_configs(
            sys.stdin.read())
    except Exception as ex:
        _LOGGER.error("Failed to setup config for manager TA: %s", ex.message)
        _LOGGER.error(traceback.format_exc())
        raise

    if not stanza_configs:
        _LOGGER.info("No config, exiting...")
        return 0

    if stanza_configs:
        loglevel = stanza_configs[0].get("loglevel", "INFO")
        _LOGGER.info("Setup logging level=%s", loglevel)
        for log_file in all_logs:
            utils.setup_logging(log_file, loglevel, True)

    ta_manager = tm.TAManager(meta_configs, stanza_configs[0])
    _setup_signal_handler(ta_manager)
    ta_manager.run()
def __init__(self, name):
    logger = logging.getLogger('log')
    setup_logging()

    try:
        import smbus
    except ImportError:
        logger.critical('[Arduino Socket]: SMBUS not configured properly!')
        sys.exit(1)

    arduino_device = None  # Global arduino_device variable
    states = None

    # Define the socket parameters
    HOST = ''
    PORT = 7893
    connection = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    connection.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)

    # Bind socket to local host and port
    try:
        connection.bind((HOST, PORT))
    except socket.error, msg:
        logger.critical('[Arduino Socket]: Bind failed. Error Code: ' + str(msg[0]) +
                        ' Message ' + msg[1])
        sys.exit()
def main():
    _mkdirs(SRCDIR, INSTALLDIR)
    setup_logging()
    fetch_and_build()
    for db in ('sqlite3', 'mysql'):
        if db == 'mysql':
            shell('mysqladmin -u root password %s' % MYSQL_ROOT_PASSWD)
        for i in ('prompt', 'auto'):
            shell('rm -rf {}/*'.format(INSTALLDIR))
            setup_and_test(db, i)
def setup():
    global copied, uploaded, last_scanned, warnings
    copied = open_shelf("copied.db")
    uploaded = open_shelf("uploaded.db")
    last_scanned = []
    log_path = os.path.join(PROJECT_PATH, "smugsync.log")
    utils.setup_logging(log_path)
    warnings = StringIO.StringIO()
    handler = logging.StreamHandler(warnings)
    handler.setLevel(logging.WARNING)
    logging.getLogger("").addHandler(handler)
def start():
    """Start the scheduler."""
    setup_logging(logging.DEBUG if settings.DEBUG is True else logging.INFO)
    queue = Queue()
    # Start scheduler subprocess
    Process(target=scheduler_process, args=(queue, os.getpid())).start()
    # To support Ctrl+C in debug mode
    if not settings.DEBUG:
        Thread(target=amqp_thread, args=(queue, )).start()
    else:
        Process(target=amqp_thread, args=(queue, )).start()
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--ticker_file', required=True)
    parser.add_argument('--price_dir', required=True)
    parser.add_argument('--yyyy_mm', required=True)
    parser.add_argument('--k', default='12')
    parser.add_argument('--output_path', required=True)
    parser.add_argument('--verbose', action='store_true')
    args = parser.parse_args()

    utils.setup_logging(args.verbose)

    # Tickers are listed one per line.
    with open(args.ticker_file, 'r') as fp:
        tickers = fp.read().splitlines()
    logging.info('Processing %d tickers' % len(tickers))

    k = int(args.k)
    assert k > 0

    volume_map = dict()
    for i in range(len(tickers)):
        ticker = tickers[i]
        logging.info('%d/%d: %s' % (i+1, len(tickers), ticker))
        input_path = '%s/%s.csv' % (args.price_dir, ticker.replace('^', '_'))
        if not path.isfile(input_path):
            logging.warning('Input file is missing: %s' % input_path)
            continue
        with open(input_path, 'r') as fp:
            lines = fp.read().splitlines()
        vmap = dict()
        assert len(lines) > 0
        for j in range(1, len(lines)):
            d, o, h, l, c, v, a = lines[j].split(',')
            d = d[:7]
            if args.yyyy_mm < d:
                continue
            if distance(args.yyyy_mm, d) >= k:
                break
            v = float(v) * float(a)
            if d in vmap:
                vmap[d] += v
            else:
                vmap[d] = v
        assert len(vmap) <= k
        if len(vmap) < k:  # max(1, k/2):
            logging.warning('Could not find enough data for %s' % ticker)
            continue
        volume_map[ticker] = sum(vmap.values()) / len(vmap)

    with open(args.output_path, 'w') as fp:
        for ticker in sorted(volume_map.keys()):
            print('%s %f' % (ticker, volume_map[ticker]), file=fp)
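# The script above relies on a distance() helper that is not shown. A minimal
# sketch under the assumption that both arguments are 'YYYY-MM' strings and the
# result is the number of months between them (the real helper may differ):
def distance(yyyy_mm_a, yyyy_mm_b):
    # e.g. distance('2017-03', '2016-12') == 3
    ya, ma = int(yyyy_mm_a[:4]), int(yyyy_mm_a[5:7])
    yb, mb = int(yyyy_mm_b[:4]), int(yyyy_mm_b[5:7])
    return abs((ya - yb) * 12 + (ma - mb))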
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--ticker_file', required=True)
    parser.add_argument('--price_sample_dir', required=True)
    parser.add_argument('--market_sample_path', required=True)
    parser.add_argument('--yyyy_mm', required=True)
    parser.add_argument('--k', required=True)
    parser.add_argument('--output_path', required=True)
    parser.add_argument('--verbose', action='store_true')
    args = parser.parse_args()

    utils.setup_logging(args.verbose)

    k = int(args.k)
    assert k > 0

    market_samples = read_samples(args.market_sample_path)
    curr_date = args.yyyy_mm
    prev_date = compute_date(curr_date, k)
    logging.info('current date = %s, previous date = %s'
                 % (curr_date, prev_date))
    assert curr_date in market_samples
    assert prev_date in market_samples

    # Tickers are listed one per line.
    with open(args.ticker_file, 'r') as fp:
        tickers = fp.read().splitlines()
    logging.info('Processing %d tickers' % len(tickers))

    excess_map = dict()
    for i in range(len(tickers)):
        ticker = tickers[i]
        assert ticker.find('^') == -1  # ^GSPC should not be in tickers.
        logging.info('%d/%d: %s' % (i+1, len(tickers), ticker))
        stock_sample_path = '%s/%s.csv' % (args.price_sample_dir, ticker)
        if not path.isfile(stock_sample_path):
            logging.warning('Input file does not exist: %s' % stock_sample_path)
            continue
        stock_samples = read_samples(stock_sample_path)
        if (curr_date not in stock_samples
                or prev_date not in stock_samples):
            logging.warning('Insufficient data for %s' % ticker)
            continue
        excess = compute_excess(
            stock_samples[prev_date], stock_samples[curr_date],
            market_samples[prev_date], market_samples[curr_date])
        excess_map[ticker] = excess

    with open(args.output_path, 'w') as fp:
        for ticker in sorted(excess_map.keys()):
            print('%s %f' % (ticker, excess_map[ticker]), file=fp)
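# compute_excess() is also not shown above. A hypothetical sketch, assuming the
# samples are prices and "excess" means the stock's simple return minus the
# market's simple return over the same window (the real helper may differ):
def compute_excess(stock_prev, stock_curr, market_prev, market_curr):
    stock_return = (stock_curr - stock_prev) / stock_prev
    market_return = (market_curr - market_prev) / market_prev
    return stock_return - market_return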
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--ticker_file', required=True)
    parser.add_argument('--from_ticker', default='')
    parser.add_argument('--report_type', required=True)
    parser.add_argument('--input_dir', required=True)
    parser.add_argument('--verbose', action='store_true')
    args = parser.parse_args()

    utils.setup_logging(args.verbose)

    rt = args.report_type
    assert rt in TYPE_MAP, (
        'report_type must be one of %s' % TYPE_MAP.keys())
    (req_map, opt_map, add_map, skip_map) = TYPE_MAP[rt]

    # Tickers are listed one per line.
    with open(args.ticker_file, 'r') as fp:
        tickers = fp.read().splitlines()
    logging.info('Processing %d tickers' % len(tickers))

    total, opts, quarterly = 0, 0, 0
    common_keys = None
    for i in range(len(tickers)):
        ticker = tickers[i]
        if ticker < args.from_ticker or ticker in SKIPPED_TICKERS:
            logging.info('%d/%d: skipped %s' % (i+1, len(tickers), ticker))
            continue
        logging.info('%d/%d: %s' % (i+1, len(tickers), ticker))
        input_path = '%s/%s.csv' % (args.input_dir, ticker)
        if not path.isfile(input_path):
            logging.warning('Input file does not exist: %s' % input_path)
            continue
        keys, has_opt, is_quarterly = validate(
            input_path, ticker, req_map, opt_map, add_map, skip_map)
        if common_keys is None:
            common_keys = keys
        else:
            common_keys &= keys
        if has_opt:
            opts += 1
        if is_quarterly:
            quarterly += 1
        total += 1

    logging.info('%d out of %d have optional metrics' % (opts, total))
    logging.info('%d out of %d are consecutive quarters' % (quarterly, total))
    logging.info('Common keys: %s' % common_keys)
def __init__(self, meta_configs, stanza_configs):
    """
    @meta_configs: a dict like object, implement dict.get/[] like interfaces
    to get the value for a key. meta_configs shall at least contain
    {"server_uri": uri, "checkpoint_dir": dir, "session_key": key}
    key/value pairs
    @stanza_configs: a list like object containing a list of dict like
    object. Each element shall implement dict.get/[] like interfaces to get
    the value for a key. Each element in the list shall at least contain
    """

    import timer_queue as tq
    import ta_configure_manager as conf_mgr
    import servers
    import ta_conf_client as tcc

    self.meta_configs = meta_configs
    appname = utils.get_appname_from_path(op.abspath(__file__))
    meta_configs["appname"] = appname
    self.wakeup_queue = Queue.Queue()
    self.conf_manager = conf_mgr.TAConfigureManager(meta_configs)
    self.timer_queue = tq.TimerQueue()
    self.pub_server = servers.PubServer(stanza_configs)
    self.rep_server = servers.RepServer(stanza_configs, self._handle_request)
    self.conf_client = tcc.TAConfClient(stanza_configs["repserver"],
                                        meta_configs["server_uri"],
                                        meta_configs["session_key"])
    self._state_logger = utils.setup_logging("ta_state")
    self._started = False
def __init__(self,
             take_ownership=True,  # Tor dies when the Crawler does
             torrc_config={"CookieAuth": "1"},
             tor_log="/var/log/tor/tor.log",
             tor_cell_log="/var/log/tor/tor_cell_seq.log",
             control_port=9051,
             socks_port=9050,
             run_in_xvfb=True,
             tbb_path=join("/opt", "tbb", "tor-browser_en-US"),
             tb_log_path=join(_log_dir, "firefox.log"),
             tb_tor_cfg=USE_RUNNING_TOR,
             page_load_timeout=20,
             wait_on_page=5,
             wait_after_closing_circuits=0,
             restart_on_sketchy_exception=True,
             additional_control_fields={},
             db_handler=None):

    self.logger = setup_logging(_log_dir, "crawler")

    self.torrc_config = torrc_config
    self.socks_port = find_free_port(socks_port, control_port)
    self.torrc_config.update({"SocksPort": str(self.socks_port)})
    self.control_port = find_free_port(control_port, self.socks_port)
    self.torrc_config.update({"ControlPort": str(self.control_port)})
    self.torrc_config.update({"Log": "INFO file {}".format(tor_log)})

    self.logger.info("Starting tor process with config "
                     "{torrc_config}.".format(**locals()))
    self.tor_process = launch_tor_with_config(config=self.torrc_config,
                                              take_ownership=take_ownership)
    self.authenticate_to_tor_controlport()

    self.logger.info("Opening cell log stream...")
    self.cell_log = open(tor_cell_log, "rb")

    if run_in_xvfb:
        self.logger.info("Starting Xvfb...")
        self.run_in_xvfb = True
        self.virtual_framebuffer = start_xvfb()

    self.logger.info("Starting Tor Browser...")
    self.tb_driver = TorBrowserDriver(tbb_path=tbb_path,
                                      tor_cfg=tb_tor_cfg,
                                      tbb_logfile_path=tb_log_path,
                                      socks_port=self.socks_port,
                                      control_port=self.control_port)

    self.wait_after_closing_circuits = wait_after_closing_circuits
    self.page_load_timeout = page_load_timeout
    self.tb_driver.set_page_load_timeout(page_load_timeout)
    self.wait_on_page = wait_on_page
    self.restart_on_sketchy_exception = restart_on_sketchy_exception

    self.control_data = self.get_control_data(page_load_timeout,
                                              wait_on_page,
                                              wait_after_closing_circuits,
                                              additional_control_fields)
    self.db_handler = db_handler
    if db_handler:
        self.crawlid = self.db_handler.add_crawl(self.control_data)
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--ticker_file', required=True)
    parser.add_argument('--report_type', required=True)
    parser.add_argument('--period', required=True)
    parser.add_argument('--output_dir', required=True)
    parser.add_argument('--overwrite', action='store_true')
    parser.add_argument('--verbose', action='store_true')
    args = parser.parse_args()

    utils.setup_logging(args.verbose)

    rt = args.report_type
    assert rt == 'is' or rt == 'bs' or rt == 'cf', (
        'report_type must be one of "is", "bs" and "cf"')
    p = args.period
    assert p == '3' or p == '12', 'period must be "3" or "12"'

    # Tickers are listed one per line.
    with open(args.ticker_file, 'r') as fp:
        tickers = fp.read().splitlines()
    logging.info('Processing %d tickers' % len(tickers))

    sl, fl = [], []  # Lists of tickers succeeded/failed to download.
    for i in range(len(tickers)):
        ticker = tickers[i]
        logging.info('%d/%d: %s' % (i+1, len(tickers), ticker))
        output_path = '%s/%s.csv' % (args.output_dir, ticker)
        dl = False
        if path.isfile(output_path):
            action = 'skipping'
            if args.overwrite:
                remove(output_path)
                action = 'overwriting'
                dl = True
            logging.warning('Output file exists: %s, %s' % (output_path, action))
        else:
            dl = True
        if dl:
            ok = download(ticker, rt, p, output_path)
            if ok:
                sl.append(ticker)
            else:
                fl.append(ticker)

    logging.info('Downloaded %d tickers, failed %d tickers'
                 % (len(sl), len(fl)))
    logging.info('Downloaded tickers: %s' % sl)
    logging.info('Failed tickers: %s' % fl)
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--er1_path', required=True)
    parser.add_argument('--er12_path', required=True)
    parser.add_argument('--tv2mc_path', required=True)
    parser.add_argument('--er2_path', required=True)
    parser.add_argument('--e2p_path', required=True)
    parser.add_argument('--roe_path', required=True)
    parser.add_argument('--b2p_path', required=True)
    parser.add_argument('--er6_path', required=True)
    parser.add_argument('--cf2p_path', required=True)
    parser.add_argument('--output_path', required=True)
    parser.add_argument('--verbose', action='store_true')
    args = parser.parse_args()

    utils.setup_logging(args.verbose)

    er1_map = utils.read_map(args.er1_path)
    er12_map = utils.read_map(args.er12_path)
    tv2mc_map = utils.read_map(args.tv2mc_path)
    er2_map = utils.read_map(args.er2_path)
    e2p_map = utils.read_map(args.e2p_path)
    roe_map = utils.read_map(args.roe_path)
    b2p_map = utils.read_map(args.b2p_path)
    er6_map = utils.read_map(args.er6_path)
    cf2p_map = utils.read_map(args.cf2p_path)

    tickers = (er1_map.keys() & er12_map.keys() & tv2mc_map.keys()
               & er2_map.keys() & e2p_map.keys() & roe_map.keys()
               & b2p_map.keys() & er6_map.keys() & cf2p_map.keys())
    logging.info('%d tickers' % len(tickers))
    logging.info('total weight: %f'
                 % (ER1 + ER12 + TV2MC + ER2 + E2P + ROE + B2P + ER6 + CF2P))

    with open(args.output_path, 'w') as fp:
        for t in sorted(tickers):
            score = (er1_map[t] * ER1 +
                     er12_map[t] * ER12 +
                     tv2mc_map[t] * TV2MC +
                     er2_map[t] * ER2 +
                     e2p_map[t] * E2P +
                     roe_map[t] * ROE +
                     b2p_map[t] * B2P +
                     er6_map[t] * ER6 +
                     cf2p_map[t] * CF2P) / 100  # accounting for %
            print('%s %f' % (t, score), file=fp)
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--net_income_path', required=True)
    parser.add_argument('--total_equity_path', required=True)
    parser.add_argument('--output_path', required=True)
    parser.add_argument('--verbose', action='store_true')
    args = parser.parse_args()

    utils.setup_logging(args.verbose)

    ni_map = utils.read_map(args.net_income_path)
    e_map = utils.read_map(args.total_equity_path)
    tickers = ni_map.keys() & e_map.keys()

    with open(args.output_path, 'w') as fp:
        for ticker in sorted(tickers):
            output = ni_map[ticker] / e_map[ticker]
            print('%s %f' % (ticker, output), file=fp)
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--prices_path', required=True)
    parser.add_argument('--outstanding_shares_path', required=True)
    parser.add_argument('--output_path', required=True)
    parser.add_argument('--verbose', action='store_true')
    args = parser.parse_args()

    utils.setup_logging(args.verbose)

    p_map = utils.read_map(args.prices_path)
    s_map = utils.read_map(args.outstanding_shares_path)
    tickers = p_map.keys() & s_map.keys()

    with open(args.output_path, 'w') as fp:
        for ticker in sorted(tickers):
            output = p_map[ticker] * s_map[ticker]
            print('%s %f' % (ticker, output), file=fp)
def setup_logging(self, level='DEBUG'):
    """
    Sets up the generic logging of the worker

    :param level: level of the logging, INFO, DEBUG, WARN
    :return: no return
    """
    return utils.setup_logging(__file__, self.__class__.__name__)
def test_setup_logging(self):
    logname = "ta_frmk_unittest"
    logfile = utils.make_splunk_path(["var", "log", "splunk",
                                      "%s.log" % logname])
    try:
        os.remove(logfile)
    except OSError:
        pass

    logger = utils.setup_logging(logname, "DEBUG")
    logger.debug("ta_unittest_frmk_debug")
    logger.info("ta_unittest_frmk_info")
    logger.error("ta_unittest_frmk_error")

    utils.setup_logging(logname, "INFO", True)
    logger.debug("ta_unittest_frmk_debug")
    logger.info("ta_unittest_frmk_info")
    logger.error("ta_unittest_frmk_error")

    utils.setup_logging(logname, "ERROR", True)
    logger.debug("ta_unittest_frmk_debug")
    logger.info("ta_unittest_frmk_info")
    logger.error("ta_unittest_frmk_error")

    with open(logfile) as f:
        logs = f.readlines()

    self.assertEqual(len(logs), 6)
    m = re.search(r"DEBUG\s+\d+\s+-\s+ta_unittest_frmk_debug$", logs[0])
    self.assertIsNotNone(m)
    m = re.search(r"INFO\s+\d+\s+-\s+ta_unittest_frmk_info$", logs[1])
    self.assertIsNotNone(m)
    m = re.search(r"ERROR\s+\d+\s+-\s+ta_unittest_frmk_error$", logs[2])
    self.assertIsNotNone(m)
    m = re.search(r"INFO\s+\d+\s+-\s+ta_unittest_frmk_info$", logs[3])
    self.assertIsNotNone(m)
    m = re.search(r"ERROR\s+\d+\s+-\s+ta_unittest_frmk_error$", logs[4])
    self.assertIsNotNone(m)
    m = re.search(r"ERROR\s+\d+\s+-\s+ta_unittest_frmk_error$", logs[5])
    self.assertIsNotNone(m)
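# The test above implies a contract for the Splunk TA helper: a named,
# file-backed logger formatted roughly as "LEVEL <pid> - message" whose level
# can be changed by calling setup_logging() again with refresh=True. A rough,
# hypothetical sketch of such a helper (the real utils.setup_logging is not
# shown here and certainly differs in details such as the log file location):
import logging

def setup_logging(name, level="INFO", refresh=False):
    logger = logging.getLogger(name)
    if refresh:
        for handler in list(logger.handlers):
            logger.removeHandler(handler)
    if not logger.handlers:
        handler = logging.FileHandler("%s.log" % name)
        handler.setFormatter(logging.Formatter(
            "%(asctime)s %(levelname)s %(process)d - %(message)s"))
        logger.addHandler(handler)
    logger.setLevel(getattr(logging, level))
    return logger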
def __init__(self, repserver_ip_port, splunkd_uri, session_key):
    self._req_client = cb.ReqClient(repserver_ip_port)
    self._splunkd_uri = splunkd_uri
    self._session_key = session_key
    self._conf_thr = threading.Thread(target=self._monitor_and_generate)
    self._shutdown_q = Queue.Queue()
    self._tasks_need_resent = {}
    self._heartbeat_logger = utils.setup_logging("ta_heartbeat")
    self._started = False
    self._stopped = False
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--input_path', required=True)
    parser.add_argument('--output_data_path', required=True)
    parser.add_argument('--output_index_path', required=True)
    parser.add_argument('--verbose', action='store_true')
    args = parser.parse_args()

    utils.setup_logging(args.verbose)

    with open(args.input_path, 'r') as fp:
        lines = fp.read().splitlines()

    # This block below is to keep the output data in sync with the ones
    # produced by split_data_for_cv.py. I.e. the date and ticker of each
    # input line are swapped (such that date goes before ticker), and the
    # lines are sorted (by date and then by ticker).

    # Swap date and ticker in place.
    item_count = -1
    for i in range(len(lines)):
        items = lines[i].split(' ')
        if item_count < 0:
            item_count = len(items)
        else:
            assert item_count == len(items)
        items[0], items[1] = items[1], items[0]
        lines[i] = ' '.join(items)
    # This will sort lines by entry and then ticker.
    lines.sort()

    data_fp = open(args.output_data_path, 'w')
    index_fp = open(args.output_index_path, 'w')
    for line in lines:
        items = line.split(' ')
        assert len(items) > 3
        data = '%s %s' % (utils.make_label(float(items[2]), False),
                          ' '.join(items[3:]))
        index = ' '.join(items[:2])
        print(data, file=data_fp)
        print(index, file=index_fp)
    data_fp.close()
    index_fp.close()
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--scores_path', required=True)
    parser.add_argument('--prices_path', required=True)
    parser.add_argument('--mc_path', required=True)
    parser.add_argument('--output_path', required=True)
    parser.add_argument('--verbose', action='store_true')
    args = parser.parse_args()

    utils.setup_logging(args.verbose)

    s_map = utils.read_map(args.scores_path)
    p_map = utils.read_map(args.prices_path)
    mc_map = utils.read_map(args.mc_path)
    tickers = s_map.keys() & p_map.keys() & mc_map.keys()

    with open(args.output_path, 'w') as fp:
        for ticker in sorted(tickers):
            if p_map[ticker] < MIN_PRICE:
                continue
            if mc_map[ticker] < MIN_MC:
                continue
            print('%s %f' % (ticker, s_map[ticker]), file=fp)
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--ticker_file', required=True)
    parser.add_argument('--output_dir', required=True)
    parser.add_argument('--overwrite', action='store_true')
    parser.add_argument('--verbose', action='store_true')
    args = parser.parse_args()

    utils.setup_logging(args.verbose)

    # Tickers are listed one per line.
    with open(args.ticker_file, 'r') as fp:
        tickers = fp.read().splitlines()
    logging.info('Processing %d tickers' % len(tickers))

    sl, fl = [], []  # Lists of tickers succeeded/failed to download.
    for i in range(len(tickers)):
        ticker = tickers[i]
        logging.info('%d/%d: %s' % (i+1, len(tickers), ticker))
        output_path = '%s/%s.csv' % (args.output_dir, ticker.replace('^', '_'))
        dl = False
        if path.isfile(output_path):
            action = 'skipping'
            if args.overwrite:
                remove(output_path)
                action = 'overwriting'
                dl = True
            logging.warning('Output file exists: %s, %s' % (output_path, action))
        else:
            dl = True
        if dl:
            ok = download(ticker, output_path)
            if ok:
                sl.append(ticker)
            else:
                fl.append(ticker)

    logging.info('Downloaded %d tickers, failed %d tickers'
                 % (len(sl), len(fl)))
    logging.info('Downloaded tickers: %s' % sl)
    logging.info('Failed tickers: %s' % fl)
def main(conf_file):
    utils.setup_logging(False)
    logger = logging.getLogger("boten")
    config = boten.core.get_config(init=conf_file)
    sqs_conn = sqs.connect_to_region(config['config']['aws_region'])
    queue = sqs_conn.get_queue(config['config']['queue_name'])
    bots = init_bots()
    logger.info('bots loaded [{}]'.format(",".join(bots.keys())))
    while True:
        logger.info('polling for new job')
        with utils.poll_sqs(queue) as payload:
            logger.info('Got new job')
            bot_name = payload['command'][1:]
            if payload['token'] != config[bot_name]['slack_token']:
                logger.warning('Got unauthorized slack command')
                logger.warning(payload)
                continue
            payload['subcommand'] = payload['text'].partition(' ')[0]
            payload['args'] = payload['text'].partition(' ')[2]
            p = multiprocessing.Process(target=run_payload,
                                        args=(bots[bot_name], payload, logger))
            p.start()
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--ticker_file', required=True)
    parser.add_argument('--sample_dir', required=True)
    parser.add_argument('--market_sample_path', required=True)
    parser.add_argument('--output_dir', required=True)
    parser.add_argument('--er_months', default=ER_MONTHS)
    parser.add_argument('--ev_months', default=EV_MONTHS)
    parser.add_argument('--overwrite', action='store_true')
    parser.add_argument('--verbose', action='store_true')
    args = parser.parse_args()

    utils.setup_logging(args.verbose)

    market_samples = utils.read_samples(args.market_sample_path)
    er_months = [int(m) for m in args.er_months.split(',')]
    ev_months = [int(m) for m in args.ev_months.split(',')]

    # Tickers are listed one per line.
    with open(args.ticker_file, 'r') as fp:
        tickers = fp.read().splitlines()
    logging.info('Processing %d tickers' % len(tickers))

    for i in range(len(tickers)):
        ticker = tickers[i]
        assert ticker.find('^') == -1  # ^GSPC should not be in tickers.
        logging.info('%d/%d: %s' % (i+1, len(tickers), ticker))
        stock_sample_path = '%s/%s.csv' % (args.sample_dir, ticker)
        if not path.isfile(stock_sample_path):
            logging.warning('Input file does not exist: %s' % stock_sample_path)
            continue
        # The output format is no longer csv. Use txt instead.
        output_path = '%s/%s.txt' % (args.output_dir, ticker)
        if path.isfile(output_path) and not args.overwrite:
            logging.warning('Output file exists: %s, skipping' % output_path)
            continue
        stock_samples = utils.read_samples(stock_sample_path)
        compute_features(stock_samples, market_samples, er_months, ev_months,
                         output_path)
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--ticker_file', required=True)
    parser.add_argument('--price_sample_dir', required=True)
    parser.add_argument('--yyyy_mm', required=True)
    parser.add_argument('--output_path', required=True)
    parser.add_argument('--verbose', action='store_true')
    args = parser.parse_args()

    utils.setup_logging(args.verbose)

    # Tickers are listed one per line.
    with open(args.ticker_file, 'r') as fp:
        tickers = fp.read().splitlines()
    logging.info('Processing %d tickers' % len(tickers))

    price_map = dict()
    for i in range(len(tickers)):
        ticker = tickers[i]
        logging.info('%d/%d: %s' % (i+1, len(tickers), ticker))
        input_path = '%s/%s.csv' % (args.price_sample_dir,
                                    ticker.replace('^', '_'))
        if not path.isfile(input_path):
            logging.warning('Input file is missing: %s' % input_path)
            continue
        with open(input_path, 'r') as fp:
            lines = fp.read().splitlines()
        found = False
        for line in lines:
            if line.startswith(args.yyyy_mm):
                d, v, p = line.split(' ')
                price_map[ticker] = float(p)
                found = True
                break
        if not found:
            logging.warning('Could not find current price data for %s' % ticker)

    with open(args.output_path, 'w') as fp:
        for ticker in sorted(price_map.keys()):
            print('%s %.2f' % (ticker, price_map[ticker]), file=fp)
def main(args): "Put all the pieces together" if args.dump_per_instance_results: args.dump = True if args.dump: args.disable_tqdm = True if len(args.logfile.name) == 0: basename_fusion = [ str(i.with_suffix('').with_name(i.stem)) for i in args.snapshot ] args.logfile = Path('-'.join(basename_fusion) + '_corpus-eval') if args.logfile.exists(): raise ValueError( f'{args.logfile} already exists. Please provide a logfile or' 'backup existing results.') setup_logging(args) logging.info('Corpus Retrieval Evaluation for CAL/MCN') logging.info(f'Git revision hash: {get_git_revision_hash()}') load_hyperparameters(args) logging.info(args) engine_prm = {} if args.arch == 'MCN': args.dataset = 'UntrimmedMCN' args.engine = 'MomentRetrievalFromProposalsTable' elif args.arch == 'SMCN': args.dataset = 'UntrimmedSMCN' args.engine = 'MomentRetrievalFromClipBasedProposalsTable' else: ValueError('Unknown/unsupported architecture') logging.info('Loading dataset') dataset_novisual = True dataset_cues = {feat: None for feat in args.tags} if args.h5_path: for i, key in enumerate(args.tags): dataset_cues[key] = {'file': args.h5_path[i]} dataset_novisual = False clip_length = None else: clip_length = args.clip_length proposals_interface = proposals.__dict__[args.proposal_interface]( args.min_length, args.scales, args.stride) dataset_setup = dict(json_file=args.test_list, cues=dataset_cues, loc=args.loc, context=args.context, debug=args.debug, eval=True, no_visual=dataset_novisual, proposals_interface=proposals_interface, clip_length=clip_length) dataset = dataset_untrimmed.__dict__[args.dataset](**dataset_setup) if args.arch == 'SMCN': logging.info('Set padding on UntrimmedSMCN dataset') dataset.set_padding(False) logging.info('Setting up models') models_dict = {} for i, key in enumerate(args.snapshot_tags): arch_setup = dict( visual_size=dataset.visual_size[key], lang_size=dataset.language_size, max_length=dataset.max_words, embedding_size=args.embedding_size, visual_hidden=args.visual_hidden, lang_hidden=args.lang_hidden, visual_layers=args.visual_layers, ) models_dict[key] = model.__dict__[args.arch](**arch_setup) filename = args.snapshot[i].with_suffix('.pth.tar') snapshot_ = torch.load(filename, map_location=lambda storage, loc: storage) models_dict[key].load_state_dict(snapshot_['state_dict']) models_dict[key].eval() logging.info('Creating database alas indexing corpus') engine = corpus.__dict__[args.engine](dataset, models_dict, **engine_prm) engine.indexing() logging.info('Launch evaluation...') # log-scale up to the end of the database if len(args.topk) == 1 and args.topk[0] == 0: exp = int(np.floor(np.log10(engine.num_moments))) args.topk = [10**i for i in range(0, exp + 1)] args.topk.append(engine.num_moments) num_instances_retrieved = [] judge = CorpusVideoMomentRetrievalEval(topk=args.topk) args.n_display = max(int(args.n_display * len(dataset.metadata)), 1) for it, query_metadata in tqdm(enumerate(dataset.metadata), disable=args.disable_tqdm): result_per_query = engine.query( query_metadata['language_input'], return_indices=args.dump_per_instance_results) if args.dump_per_instance_results: vid_indices, segments, proposals_ind = result_per_query else: vid_indices, segments = result_per_query judge.add_single_predicted_moment_info(query_metadata, vid_indices, segments, max_rank=engine.num_moments) num_instances_retrieved.append(len(vid_indices)) if args.disable_tqdm and (it + 1) % args.n_display == 0: logging.info(f'Processed queries [{it}/{len(dataset.metadata)}]') if args.dump_per_instance_results: # 
TODO: wrap-up this inside a class. We could even dump in a # non-blocking thread using a Queue if it == 0: filename = args.logfile.with_suffix('.h5') fid = h5py.File(filename, 'x') if args.reduced_dump: fid_vi = fid.create_dataset(name='vid_indices', chunks=True, shape=(len(dataset), dataset.num_videos), dtype='int64') else: fid.create_dataset(name='proposals', data=engine.proposals, chunks=True) fid_vi = fid.create_dataset(name='vid_indices', chunks=True, shape=(len(dataset), ) + vid_indices.shape, dtype='int64') fid_pi = fid.create_dataset(name='proposals_ind', chunks=True, shape=(len(dataset), ) + proposals_ind.shape, dtype='int64') if args.reduced_dump: fid_vi[it, ...] = pd.unique(vid_indices.numpy()) else: fid_vi[it, ...] = vid_indices fid_pi[it, ...] = proposals_ind if args.dump_per_instance_results: fid.close() logging.info('Summarizing results') num_instances_retrieved = np.array(num_instances_retrieved) logging.info(f'Number of queries: {len(judge.map_query)}') logging.info(f'Number of proposals: {engine.num_moments}') retrieved_proposals_median = int(np.median(num_instances_retrieved)) retrieved_proposals_min = int(num_instances_retrieved.min()) if (num_instances_retrieved != engine.num_moments).any(): logging.info('Triggered approximate search') logging.info('Median numbers of retrieved proposals: ' f'{retrieved_proposals_median:d}') logging.info('Min numbers of retrieved proposals: ' f'{retrieved_proposals_min:d}') result = judge.evaluate() _ = [logging.info(f'{k}: {v}') for k, v in result.items()] if args.dump: filename = args.logfile.with_suffix('.json') logging.info(f'Dumping results into: {filename}') with open(filename, 'x') as fid: for key, value in result.items(): result[key] = float(value) result['snapshot'] = [str(i) for i in args.snapshot] result['corpus'] = str(args.test_list) result['topk'] = args.topk result['iou_threshold'] = judge.iou_thresholds result['median_proposals_retrieved'] = retrieved_proposals_median result['min_proposals_retrieved'] = retrieved_proposals_min result['date'] = datetime.now().isoformat() result['git_hash'] = get_git_revision_hash() json.dump(result, fid, indent=1)
def run(local_rank: int, config: Any, *args: Any, **kwargs: Any):
    """function to be run by idist.Parallel context manager."""

    # ----------------------
    # make a certain seed
    # ----------------------
    rank = idist.get_rank()
    manual_seed(config.seed + rank)

    # -----------------------
    # create output folder
    # -----------------------
    if rank == 0:
        now = datetime.now().strftime("%Y%m%d-%H%M%S")
        name = f"{config.dataset}-backend-{idist.backend()}-{now}"
        path = Path(config.output_dir, name)
        path.mkdir(parents=True, exist_ok=True)
        config.output_dir = path.as_posix()

    config.output_dir = Path(idist.broadcast(config.output_dir, src=0))

    # -----------------------------
    # datasets and dataloaders
    # -----------------------------
    train_dataset, num_channels = get_datasets(config.dataset, config.data_path)

    train_dataloader = idist.auto_dataloader(
        train_dataset,
        batch_size=config.batch_size,
        num_workers=config.num_workers,
        {% if use_distributed_training and not use_distributed_launcher %}
        persistent_workers=True,
        {% endif %}
    )

    # ------------------------------------------
    # model, optimizer, loss function, device
    # ------------------------------------------
    device = idist.device()
    netD, netG, optimizerD, optimizerG, loss_fn, lr_scheduler = initialize(config, num_channels)

    # -----------------------------
    # trainer and evaluator
    # -----------------------------
    ws = idist.get_world_size()
    real_labels = torch.ones(config.batch_size // ws, device=device)
    fake_labels = torch.zeros(config.batch_size // ws, device=device)
    fixed_noise = torch.randn(config.batch_size // ws, config.z_dim, 1, 1, device=device)

    trainer = create_trainers(
        config=config,
        netD=netD,
        netG=netG,
        optimizerD=optimizerD,
        optimizerG=optimizerG,
        loss_fn=loss_fn,
        device=device,
        real_labels=real_labels,
        fake_labels=fake_labels,
    )

    # -------------------------------------------
    # setup engines logger with python logging
    # print training configurations
    # -------------------------------------------
    logger = setup_logging(config)
    log_basic_info(logger, config)
    trainer.logger = logger

    # -------------------------------------
    # ignite handlers and ignite loggers
    # -------------------------------------
    to_save = {'netD': netD, 'netG': netG, 'optimizerD': optimizerD,
               'optimizerG': optimizerG, 'trainer': trainer}
    optimizers = {'optimizerD': optimizerD, 'optimizerG': optimizerG}
    best_model_handler, es_handler, timer_handler = get_handlers(
        config=config,
        model={'netD': netD, 'netG': netG},
        trainer=trainer,
        evaluator=trainer,
        metric_name='errD',
        es_metric_name='errD',
        to_save=to_save,
        lr_scheduler=lr_scheduler,
        output_names=["errD", "errG", "D_x", "D_G_z1", "D_G_z2"],
    )

    # setup ignite logger only on rank 0
    if rank == 0:
        logger_handler = get_logger(config=config, trainer=trainer, optimizers=optimizers)

    # -----------------------------------
    # resume from the saved checkpoints
    # -----------------------------------
    if config.resume_from:
        resume_from(to_load=to_save, checkpoint_fp=config.resume_from)

    # --------------------------------------------------
    # adding handlers using `trainer.on` decorator API
    # --------------------------------------------------
    @trainer.on(Events.EPOCH_COMPLETED)
    def save_fake_example(engine):
        fake = netG(fixed_noise)
        path = config.output_dir / (FAKE_IMG_FNAME.format(engine.state.epoch))
        vutils.save_image(fake.detach(), path, normalize=True)

    # --------------------------------------------------
    # adding handlers using `trainer.on` decorator API
    # --------------------------------------------------
    @trainer.on(Events.EPOCH_COMPLETED)
    def save_real_example(engine):
        img, y = engine.state.batch
        path = config.output_dir / (REAL_IMG_FNAME.format(engine.state.epoch))
        vutils.save_image(img, path, normalize=True)

    # -------------------------------------------------------------
    # adding handlers using `trainer.on` decorator API
    # -------------------------------------------------------------
    @trainer.on(Events.EPOCH_COMPLETED)
    def print_times(engine):
        if timer_handler:
            logger.info(f"Epoch {engine.state.epoch} done. Time per batch: {timer_handler.value():.3f}[s]")
            timer_handler.reset()

    @trainer.on(Events.ITERATION_COMPLETED(every=config.log_every_iters))
    @idist.one_rank_only()
    def print_logs(engine):
        fname = config.output_dir / LOGS_FNAME
        columns = ["iteration", ] + list(engine.state.metrics.keys())
        values = [str(engine.state.iteration), ] + [
            str(round(value, 5)) for value in engine.state.metrics.values()]

        with open(fname, "a") as f:
            if f.tell() == 0:
                print("\t".join(columns), file=f)
            print("\t".join(values), file=f)

        message = f"[{engine.state.epoch}/{config.max_epochs}][{engine.state.iteration % len(train_dataloader)}/{len(train_dataloader)}]"
        for name, value in zip(columns, values):
            message += f" | {name}: {value}"

    # -------------------------------------------------------------
    # adding handlers using `trainer.on` decorator API
    # -------------------------------------------------------------
    @trainer.on(Events.EPOCH_COMPLETED)
    def create_plots(engine):
        try:
            import matplotlib as mpl
            mpl.use("agg")
            import matplotlib.pyplot as plt
            import pandas as pd
        except ImportError:
            warnings.warn("Loss plots will not be generated -- pandas or matplotlib not found")
        else:
            df = pd.read_csv(config.output_dir / LOGS_FNAME, delimiter="\t", index_col="iteration")
            _ = df.plot(subplots=True, figsize=(20, 20))
            _ = plt.xlabel("Iteration number")
            fig = plt.gcf()
            path = config.output_dir / PLOT_FNAME
            fig.savefig(path)

    # --------------------------------
    # print metrics to the stderr
    # with `add_event_handler` API
    # for training stats
    # --------------------------------
    trainer.add_event_handler(Events.ITERATION_COMPLETED(every=config.log_every_iters),
                              log_metrics, tag="train")

    # ------------------------------------------
    # setup if done. let's run the training
    # ------------------------------------------
    trainer.run(train_dataloader, max_epochs=config.max_epochs,
                epoch_length=config.train_epoch_length)

    # ------------------------------------------------------------
    # close the logger after the training completed / terminated
    # ------------------------------------------------------------
    if rank == 0:
        from ignite.contrib.handlers.wandb_logger import WandBLogger

        if isinstance(logger_handler, WandBLogger):
            # why handle differently for wandb?
            # See: https://github.com/pytorch/ignite/issues/1894
            logger_handler.finish()
        elif logger_handler:
            logger_handler.close()

    # -----------------------------------------
    # where is my best and last checkpoint ?
    # -----------------------------------------
    if best_model_handler is not None:
        logger.info("Last and best checkpoint: %s", best_model_handler.last_checkpoint)
    ])
    valid_transform = transforms.Compose([
        transforms.CenterCrop(args.crop_size),
        # transforms.RandomHorizontalFlip(),  # do we need to flip when eval?
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    train_dataloader = get_loader(opt, mode='train', transform=train_transform)
    valid_dataloader = get_loader(opt, mode='val', transform=valid_transform)
    print('load the dataset into memory...')
    print('total iterations in training phase : {} \n'
          'total iterations in validation phase : {}'.format(
              len(train_dataloader), len(valid_dataloader)))

    trainer = Trainer(opt, train_dataloader, valid_dataloader)
    trainer.train()
    print('done')


if __name__ == "__main__":
    args = parse_opt()
    setup_logging(os.path.join('log.txt'))
    logging.info("\nrun arguments: %s",
                 json.dumps(vars(args), indent=4, sort_keys=True))
    main(args)
    print('done')
def main():
    utils.setup_logging()
    logging.info('Script Started')
    logging.debug('args: ' + str(sys.argv))

    # If the environment variables are not present, then we assume the script
    # is running locally, not in OpenShift.
    if not set(('COURSE_ID', 'CATEGORY_NAME')).issubset(environ):
        if len(sys.argv) != 3:
            logging.error(
                "If running the script in OpenShift, the environment variables "
                "COURSE_ID/CATEGORY_NAME may be misspelled; if running locally, "
                "the command line arguments (paths to the properties files) are missing. "
                "Usage: 'python groupsforsections.py /config.yaml /security.yaml'")
            sys.exit(1)
        config_file = sys.argv[1]
        security_file = sys.argv[2]

        logging.debug('reading the file %s ' % basename(security_file))
        with open(security_file, 'r') as yml_file:
            sf = yaml.load(yml_file)
        if not sf or CONST_CANVAS not in sf:
            logging.error('The key \'canvas\' is missing ')
            sys.exit(1)

        logging.debug('reading the file %s ' % basename(config_file))
        with open(config_file, 'r') as yml_file:
            cfg = yaml.load(yml_file)
        if not cfg or CONST_COURSE not in cfg:
            logging.error('The key \'course\' is missing ')
            sys.exit(1)

        if not sf[CONST_CANVAS] or CONST_TOKEN not in sf[CONST_CANVAS] or CONST_URL not in sf[CONST_CANVAS] or \
                not cfg[CONST_COURSE] or CONST_ID not in cfg[CONST_COURSE] or CONST_GRP_CAT_NAME not in cfg[CONST_COURSE]:
            logging.error(
                "Some of the keys are missing from the properties files %s: %s , %s: %s"
                % (basename(security_file),
                   '"canvas keys missing" ' if sf[CONST_CANVAS] is None else sf[CONST_CANVAS].keys(),
                   basename(config_file),
                   '"course keys missing"' if cfg[CONST_COURSE] is None else cfg[CONST_COURSE].keys()))
            sys.exit(1)

        course_id = cfg[CONST_COURSE][CONST_ID]
        group_category_name = cfg[CONST_COURSE][CONST_GRP_CAT_NAME]
        canvas_token = sf[CONST_CANVAS][CONST_TOKEN]
        canvas_url = sf[CONST_CANVAS][CONST_URL]
    else:
        course_id = environ['COURSE_ID']
        group_category_name = environ['CATEGORY_NAME']
        with open("/usr/local/secret-volume/canvas-url", 'r') as url:
            canvas_url = url.read()
        with open("/usr/local/secret-volume/canvas-token", 'r') as token:
            canvas_token = token.read()

    if not course_id or not group_category_name or not canvas_token or not canvas_url:
        logging.error(
            "some of the configurations from properties file are missing: "
            "course_id = " + str(course_id) +
            " ; group_category_name = " + str(group_category_name) +
            " ; canvas_url = " + str(canvas_url) +
            " ; canvas_token = " + (str(canvas_token) if canvas_token is None else "Not_Shown"))
        sys.exit(1)

    logging.debug('Canvas Token: ' + canvas_token)
    logging.info('Canvas URL: ' + canvas_url)
    logging.info('Course Id: ' + course_id)
    logging.info('Group Category Name: ' + group_category_name)

    # instantiating the class
    groups_for_section_class = GroupsForSections(canvas_token, canvas_url)

    # this holds the list of users that need to be added to a group, group => users
    groups_to_users_dict = {}

    group_category_id = create_group_category(group_category_name,
                                              groups_for_section_class, course_id)
    if group_category_id is None:
        logging.error('Group category "%s" is not created for course %s '
                      % (group_category_name, course_id))
        sys.exit(1)

    sections = get_sections_for_course({}, groups_for_section_class, course_id)
    if sections is None or not sections:
        logging.error('No sections in the course or error in getting sections for the course: '
                      + course_id)
        sys.exit(1)
    logging.info('Total # of sections that are in course %s are %d and are %s '
                 % (course_id, len(sections), sections.keys()))

    for section_id in sections:
        users = get_users_in_section(groups_for_section_class, [], str(section_id))
        if users is None:
            logging.error('Could not get users in section %s(%s): '
                          % (section_id, sections[section_id]))
            sys.exit(1)
        logging.info('section %s (%s) has %s users : '
                     % (section_id, sections[section_id], str(len(users))))

        # creating one group for each section in course.
        group_id = create_group(groups_for_section_class, str(group_category_id),
                                sections[section_id], course_id)
        if group_id is None:
            logging.error('Could not create group for section %s(%s): '
                          % (section_id, sections[section_id]))
            sys.exit(1)
        logging.info('The Group id %s created for the Section %s with name %s'
                     % (str(group_id), section_id, sections[section_id]))

        # mapping all the users in a section to the corresponding group
        groups_to_users_dict[group_id] = users

    failed_groups_to_users_dict = defaultdict(list)
    success_groups_to_users_dict = defaultdict(list)

    # adding users to the group
    for group, users in groups_to_users_dict.items():
        for user in users:
            membership_id = add_users_to_group(groups_for_section_class, group, user)
            if membership_id is None:
                logging.error('The user %s is not added to the group %s' % (user, group))
                failed_groups_to_users_dict[group].append(user)
            else:
                success_groups_to_users_dict[group].append(user)
                logging.info('The User %s got added to the Group %s with membership id %s '
                             % (user, group, str(membership_id)))

    # logging total users that belong to the corresponding group
    logging.info("**** Total Users List in a Group set: ")
    for group in groups_to_users_dict:
        logging.info('%d users should be added to the group %s'
                     % (len(groups_to_users_dict[group]), group))

    # logging the total successful users added to each group
    if success_groups_to_users_dict:
        logging.info("**** Successful Addition of Users to Groups: ")
        for group in success_groups_to_users_dict:
            logging.info('%d users successfully added to the group %s'
                         % (len(success_groups_to_users_dict[group]), group))

    # logging the users list that was not added to a group
    if failed_groups_to_users_dict:
        logging.error("**** Failed Addition of Users to Groups: ")
        for group in failed_groups_to_users_dict:
            users = ','.join(failed_groups_to_users_dict[group])
            logging.info('%d users are not added in the group %s and they are %s '
                         % (len(failed_groups_to_users_dict[group]), group, users))

    logging.info('script ran successfully')
import os
import sys
import antlr3
import inspect, importlib

from PyFuncLexer import PyFuncLexer
from PyFuncParser import PyFuncParser
from splunk import Intersplunk as si
import utils

logger = utils.setup_logging("pyfunc")


def parse_func(pfunc):
    char_stream = antlr3.ANTLRStringStream(pfunc)
    lexer = PyFuncLexer(char_stream)
    tokens = antlr3.CommonTokenStream(lexer)
    tokens.fillBuffer()
    parser = PyFuncParser(tokens)
    return parser.pyfunc()


def find_func(func):
    pkg = '.'.join(func.packages)
    module = importlib.import_module(pkg)
    members = inspect.getmembers(module)
    flist = [f for n, f in members
             if inspect.isfunction(f) and n == func.name]
    if len(flist) >= 1:
        return flist[0]
def run_experiment(args): import os # set environment variables for tensorflow os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu) os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' import inspect import shutil import numpy as np import tensorflow as tf from collections import OrderedDict import matplotlib.pyplot as plt plt.switch_backend('Agg') import utils import paramgraphics import nn from tensorflow.contrib.framework.python.ops import arg_scope # import tensorflow.contrib.layers as layers # ---------------------------------------------------------------- # Arguments and Settings args.message = 'LBT-GAN-celebA_' + args.message np.random.seed(args.seed) tf.set_random_seed(args.seed) # copy file for reproducibility logger, dirname = utils.setup_logging(args) script_fn = inspect.getfile(inspect.currentframe()) script_src = os.path.abspath(script_fn) script_dst = os.path.abspath(os.path.join(dirname, script_fn)) shutil.copyfile(script_src, script_dst) logger.info("script copied from %s to %s" % (script_src, script_dst)) # print arguments for k, v in sorted(vars(args).items()): logger.info(" %20s: %s" % (k, v)) # get arguments batch_size = args.batch_size batch_size_est = args.batch_size_est gen_lr = args.gen_lr dis_lr = args.dis_lr est_lr = args.est_lr lambda_gan = args.lambda_gan beta1 = 0.5 epsilon = 1e-8 max_iter = args.max_iter viz_every = args.viz_every z_dim, vae_z_dim = utils.get_ints(args.z_dims) unrolling_steps = args.unrolling_steps assert unrolling_steps > 0 n_viz = args.n_viz # ---------------------------------------------------------------- # Dataset from dataset import load_celebA, DataSet train_x, test_x = load_celebA() train_x = train_x * 2. - 1. test_x = test_x * 2. - 1. dtrain = DataSet(train_x, None) dtest = DataSet(test_x, None) # data_channel = 3 x_dim = 64 * 64 * 3 dim_input = (64, 64) # ---------------------------------------------------------------- # Model setup logger.info("Setting up model ...") def discriminator(x, Reuse=tf.AUTO_REUSE, is_training=True): def leaky_relu(x, alpha=0.2): return tf.maximum(alpha * x, x) with tf.variable_scope("discriminator", reuse=Reuse): x = tf.reshape(x, [batch_size, 64, 64, 3]) lx = tf.layers.dropout(x, 0.2, training=is_training) conv1 = tf.layers.conv2d( lx, 64, 5, 2, use_bias=True, padding='same') conv1 = leaky_relu(conv1) conv2 = tf.layers.conv2d( conv1, 128, 5, 2, use_bias=False, padding='same') conv2 = tf.layers.batch_normalization(conv2, training=is_training) conv2 = leaky_relu(conv2) conv3 = tf.layers.conv2d( conv2, 256, 5, 2, use_bias=False, padding='same') conv3 = tf.layers.batch_normalization(conv3, training=is_training) conv3 = leaky_relu(conv3) conv4 = tf.layers.conv2d( conv3, 512, 5, 2, use_bias=False, padding='same') conv4 = tf.layers.batch_normalization(conv4, training=is_training) conv4 = leaky_relu(conv4) conv4 = tf.layers.flatten(conv4) fc2 = tf.layers.dense(conv4, 1) return fc2 def generator(z, Reuse=tf.AUTO_REUSE, flatten=True, is_training=True): if args.g_nonlin == 'relu': # print("Use Relu in G") nonlin = tf.nn.relu else: # print("Use tanh in G") nonlin = tf.nn.tanh # nonlin = tf.nn.relu if args.g_nonlin == 'relu' else tf.nn.tanh # norm_prms = {'is_training': is_training, 'decay': 0.9, 'scale': False} with tf.variable_scope("generator", reuse=Reuse): lx = tf.layers.dense(z, 4 * 4 * 512) lx = tf.reshape(lx, [-1, 4, 4, 512]) lx = tf.layers.batch_normalization(lx, training=is_training) lx = nonlin(lx) lx = tf.layers.conv2d_transpose( lx, 256, 5, 2, use_bias=False, padding='same') lx = tf.layers.batch_normalization(lx, 
training=is_training) lx = nonlin(lx) lx = tf.layers.conv2d_transpose( lx, 128, 5, 2, use_bias=False, padding='same') lx = tf.layers.batch_normalization(lx, training=is_training) lx = nonlin(lx) lx = tf.layers.conv2d_transpose( lx, 64, 5, 2, use_bias=False, padding='same') lx = tf.layers.batch_normalization(lx, training=is_training) lx = nonlin(lx) lx = tf.layers.conv2d_transpose(lx, 3, 5, 2, padding='same') lx = tf.nn.tanh(lx) if flatten is True: lx = tf.layers.flatten(lx) return lx nonlin = tf.nn.relu def compute_est_samples(z, params=None, reuse=tf.AUTO_REUSE): with tf.variable_scope("estimator"): with arg_scope([nn.dense], params=params): with tf.variable_scope("decoder", reuse=reuse): h_dec_1 = nn.dense( z, vae_z_dim, 200 * 2, "dense1", nonlinearity=nonlin) h_dec_2 = nn.dense( h_dec_1, 200 * 2, 500 * 2, "dense2", nonlinearity=nonlin) x_mean = nn.dense( h_dec_2, 500 * 2, x_dim, "dense3", nonlinearity=None) x_mean = tf.nn.tanh(x_mean) return x_mean def compute_est_ll(x, params=None, reuse=tf.AUTO_REUSE): with tf.variable_scope("estimator", reuse=reuse): logvae_x_var = tf.get_variable( "logvae_x_var", (), tf.float32, trainable=True, initializer=tf.constant_initializer(-1)) with arg_scope([nn.dense], params=params): with tf.variable_scope("encoder", reuse=reuse): h_enc_1 = nn.dense( x, x_dim, 500 * 2, "dense1", nonlinearity=nonlin) # h_enc_1 = nn.batch_norm(h_enc_1, "bn1", 129, 2) h_enc_2 = nn.dense( h_enc_1, 500 * 2, 200 * 2, "dense2", nonlinearity=nonlin) # h_enc_2 = nn.batch_norm(h_enc_2, "bn2", 128, 2) z_mean = nn.dense( h_enc_2, 200 * 2, vae_z_dim, "dense3", nonlinearity=None) z_logvar = nn.dense( h_enc_2, 200 * 2, vae_z_dim, "dense4", nonlinearity=None) epsilon = tf.random_normal(tf.shape(z_mean), dtype=tf.float32) z = z_mean + tf.exp(0.5 * z_logvar) * epsilon with tf.variable_scope("decoder", reuse=reuse): h_dec_1 = nn.dense( z, vae_z_dim, 200 * 2, "dense1", nonlinearity=nonlin) # h_dec_1 = nn.batch_norm(h_dec_1, "bn1", 127, 2) h_dec_2 = nn.dense( h_dec_1, 200 * 2, 500 * 2, "dense2", nonlinearity=nonlin) # h_dec_2 = nn.batch_norm(h_dec_2, "bn2", 128, 2) x_mean = nn.dense( h_dec_2, 500 * 2, x_dim, "dense3", nonlinearity=None) x_mean = tf.nn.tanh(x_mean) vae_x_var = tf.exp(logvae_x_var) elbo = tf.reduce_mean( tf.reduce_sum( -0.5 * np.log(2 * np.pi) - 0.5 * tf.log(vae_x_var) - tf.layers.flatten(tf.square(x - x_mean)) / (2 * vae_x_var), axis=1) - tf.reduce_sum( -0.5 * (1 + z_logvar - tf.square(z_mean) - tf.exp(z_logvar)), axis=1)) return elbo, tf.nn.tanh(x_mean) def compute_est_updated_with_SGD(x, lr=0.001, params=None): elbo, _ = compute_est_ll(x, params=params) grads = tf.gradients(elbo, params.values()) new_params = params.copy() for key, g in zip(params, grads): new_params[key] += lr * g return elbo, new_params def compute_est_updated_with_Adam(x, lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-7, decay=0., params=None, adam_params=None): elbo, _ = compute_est_ll(x, params=params) grads = tf.gradients(elbo, params.values()) new_params = params.copy() new_adam_params = adam_params.copy() new_adam_params['iterations'] += 1 lr = lr * \ (1. / (1. + decay * tf.cast(adam_params['iterations'], tf.float32))) t = tf.cast(new_adam_params['iterations'], tf.float32) lr_t = lr * (tf.sqrt(1. - tf.pow(beta_2, t)) / (1. - tf.pow(beta_1, t))) for key, g in zip(params, grads): new_adam_params['m_' + key] = ( beta_1 * adam_params['m_' + key]) + (1. - beta_1) * g new_adam_params['v_' + key] = tf.stop_gradient( (beta_2 * adam_params['v_' + key]) + (1. 
- beta_2) * tf.square(g)) new_params[ key] = params[key] + lr_t * new_adam_params['m_' + key] / tf.sqrt( new_adam_params['v_' + key] + epsilon) return elbo, new_params, new_adam_params lr = tf.placeholder(tf.float32) data = tf.placeholder(tf.float32, shape=(batch_size, x_dim)) # Construct generator and estimator nets est_params_dict = OrderedDict() _, _ = compute_est_ll(data, params=est_params_dict) gen_noise = tf.random_normal((batch_size_est, z_dim), dtype=tf.float32) samples_gen = generator(gen_noise) vae_noise = tf.random_normal((batch_size_est, vae_z_dim), dtype=tf.float32) samples_est = tf.nn.sigmoid( compute_est_samples(z=vae_noise, params=est_params_dict)) # for key in est_params_dict: # print(key, est_params_dict[key]) adam_params_dict = OrderedDict() with tf.variable_scope("adam"): adam_params_dict['iterations'] = tf.Variable( 0, dtype=tf.int64, name='iterations') for key in est_params_dict: adam_params_dict['m_' + key] = tf.Variable( tf.zeros_like(est_params_dict[key]), name='m_' + key) adam_params_dict['v_' + key] = tf.Variable( tf.zeros_like(est_params_dict[key]), name='v_' + key) gen_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "generator") est_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "estimator") adam_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "adam") # unrolling estimator updates cur_params = est_params_dict cur_adam_params = adam_params_dict elbo_genx_at_steps = [] for _ in range(unrolling_steps): samples_gen = generator( tf.random_normal((batch_size_est, z_dim), dtype=tf.float32)) elbo_genx_step, cur_params, cur_adam_params = compute_est_updated_with_Adam( samples_gen, lr=lr, beta_1=beta1, epsilon=epsilon, params=cur_params, adam_params=cur_adam_params) elbo_genx_at_steps.append(elbo_genx_step) # estimator update updates = [] for key in est_params_dict: updates.append(tf.assign(est_params_dict[key], cur_params[key])) for key in adam_params_dict: updates.append(tf.assign(adam_params_dict[key], cur_adam_params[key])) e_train_op = tf.group(*updates, name="e_train_op") # Optimize the generator on the unrolled ELBO loss unrolled_elbo_data, _ = compute_est_ll(data, params=cur_params) # unrolled_elbo_samp, _ = compute_est_ll( # tf.stop_gradient(samples_gen), params=cur_params) # GAN-loss for discriminator and generator samples_gen_gan = generator( tf.random_normal((batch_size_est, z_dim), dtype=tf.float32)) fake_D_output = discriminator(samples_gen_gan) real_D_output = discriminator(data) # print(fake_D_output, real_D_output) ganloss_g = tf.reduce_mean( tf.nn.sigmoid_cross_entropy_with_logits( labels=tf.ones_like(fake_D_output), logits=fake_D_output)) ganloss_D_fake = tf.reduce_mean( tf.nn.sigmoid_cross_entropy_with_logits( labels=tf.zeros_like(fake_D_output), logits=fake_D_output)) ganloss_D_real = tf.reduce_mean( tf.nn.sigmoid_cross_entropy_with_logits( labels=tf.ones_like(real_D_output), logits=real_D_output)) use_e_sym = tf.placeholder(tf.float32, shape=(), name="use_E") if args.lbt: logger.info("Using lbt") object_g = lambda_gan * ganloss_g - use_e_sym * unrolled_elbo_data else: logger.info("Using GAN") object_g = lambda_gan * ganloss_g # - use_e_sym * unrolled_elbo_data # object_g = -1 * unrolled_elbo_data object_d = ganloss_D_fake + ganloss_D_real dis_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "discriminator") g_update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS, "generator") g_train_opt = tf.train.AdamOptimizer( learning_rate=gen_lr, beta1=beta1, epsilon=epsilon) # g_train_opt = 
tf.train.RMSPropOptimizer(learning_rate=gen_lr, epsilon=epsilon) g_grads = g_train_opt.compute_gradients(object_g, var_list=gen_vars) # g_grads_clipped = [(tf.clip_by_value(grad, -1., 1.), var) for grad, var in g_grads] g_grads_, g_vars_ = zip(*g_grads) g_grads_clipped_, g_grads_norm_ = tf.clip_by_global_norm(g_grads_, 5.) g_grads_clipped = zip(g_grads_clipped_, g_vars_) if args.clip_grad: logger.info("Clipping gradients of generator parameters.") with tf.control_dependencies(g_update_ops): g_train_op = g_train_opt.apply_gradients(g_grads_clipped) else: with tf.control_dependencies(g_update_ops): g_train_op = g_train_opt.apply_gradients(g_grads) # g_train_op = g_train_opt.apply_gradients(g_grads) d_train_opt = tf.train.AdamOptimizer( learning_rate=dis_lr, beta1=beta1, epsilon=epsilon) d_train_op = d_train_opt.minimize(object_d, var_list=dis_vars) # ---------------------------------------------------------------- # Training sess = tf.InteractiveSession() sess.run(tf.global_variables_initializer()) saver = tf.train.Saver(max_to_keep=None) if args.model_path: saver.restore(sess, args.model_path) # # print variables # logger.info("Generator parameters:") # for p in gen_vars: # logger.debug("%s: %s" % (p.name, sess.run(tf.shape(p)))) # logger.info("Estimator parameters:") # for p in est_vars: # logger.debug("%s: %s" % (p.name, sess.run(tf.shape(p)))) # logger.info("Adam parameters:") # for p in adam_vars: # logger.debug("%s: %s" % (p.name, sess.run(tf.shape(p)))) elbo_vals = [] ganloss_vals = [] tgan_g, tgan_d_fake, tgan_d_real = 0., 0., 0. elbo_genx_val, elbo_data_val, gradients_nrom = -np.inf, -np.inf, 0 use_e_flag = 0. for i in range(max_iter + 1): x_mini_batch = dtrain.next_batch(batch_size)[0].reshape( [batch_size, x_dim]) if i > 3000: use_e_flag = 1. for _ in range(args.n_est): elbo_genx_val, _ = sess.run( [elbo_genx_at_steps[-1], e_train_op], feed_dict={lr: 3. * est_lr}) for _ in range(args.n_dis): _, tgan_g, tgan_d_real, tgan_d_fake = sess.run( [d_train_op, ganloss_g, ganloss_D_real, ganloss_D_fake], feed_dict={data: x_mini_batch}) elbo_data_val, gradients_nrom, _ = sess.run( [unrolled_elbo_data, g_grads_norm_, g_train_op], feed_dict={ data: x_mini_batch, lr: est_lr, use_e_sym: use_e_flag }) elbo_vals.append([elbo_genx_val, elbo_data_val]) ganloss_vals.append([tgan_g, tgan_d_real, tgan_d_fake]) # visualization if i % viz_every == 0: np_samples_gen, np_samples_est, np_data = sess.run( [samples_gen, samples_est, data], feed_dict={data: x_mini_batch}) np_samples_est = np_samples_est.reshape([-1, 64, 64, 3]).transpose( [0, 3, 1, 2]).reshape([-1, 64 * 64 * 3]) np_samples_gen = np_samples_gen.reshape([-1, 64, 64, 3]).transpose( [0, 3, 1, 2]).reshape([-1, 64 * 64 * 3]) np_data = np_data.reshape([-1, 64, 64, 3]).transpose( [0, 3, 1, 2]).reshape([-1, 64 * 64 * 3]) np_samples_est = np_samples_est / 2. + 0.5 np_samples_gen = np_samples_gen / 2. + 0.5 np_data = np_data / 2. 
+ 0.5 paramgraphics.mat_to_img( np_samples_gen[:n_viz], dim_input, colorImg=True, save_path=os.path.join(dirname, 'sample_' + str(i) + '_gen.png')) paramgraphics.mat_to_img( np_data[:n_viz], dim_input, colorImg=True, save_path=os.path.join(dirname, 'sample_' + str(i) + '_dat.png')) paramgraphics.mat_to_img( np_samples_est[:n_viz], dim_input, colorImg=True, save_path=os.path.join(dirname, 'sample_' + str(i) + '_est.png')) fig = plt.figure(figsize=(6, 4)) plt.plot( elbo_vals, '.', markersize=2, markeredgecolor='none', linestyle='none', alpha=min(1.0, 0.01 * max_iter / (i + 1))) plt.ylim((-200.0, 0.0)) legend = plt.legend(('elbo_genx', 'elbo_data'), markerscale=6) for lh in legend.legendHandles: lh._legmarker.set_alpha(1.) plt.grid(True) plt.tight_layout() plt.savefig(os.path.join(dirname, 'curve.png'), bbox_inches='tight') plt.close(fig) # training log if i % viz_every == 0: elbo_genx_ma_val, elbo_data_ma_val = np.mean( elbo_vals[-200:], axis=0) logger.info( "Iter %d: gradients norm = %.4f. samples LL = %.4f, data LL = %.4f." % (i, gradients_nrom, elbo_genx_ma_val, elbo_data_ma_val)) logger.info( "Iter %d: gan_g = %.4f. gan_d_real = %.4f, gan_d_fake = %.4f." % (i, tgan_g, tgan_d_real, tgan_d_fake)) if i % args.model_every == 0: saver.save(sess, os.path.join(dirname, 'model_' + str(i)))
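# ----------------------------------------------------------------
# Aside: the unrolled estimator update above (compute_est_updated_with_Adam)
# re-implements Adam by hand so the generator can differentiate through the
# estimator's update steps. A minimal NumPy sketch of that same step follows,
# using the snippet's conventions (gradient *ascent* on the ELBO, bias
# correction folded into lr_t, epsilon inside the square root). Names here
# are illustrative, not part of the original code.
import numpy as np

def adam_ascent_step(param, grad, m, v, t, lr=0.001, beta1=0.9, beta2=0.999, eps=1e-7):
    """One Adam step in the same form as compute_est_updated_with_Adam."""
    m = beta1 * m + (1. - beta1) * grad
    v = beta2 * v + (1. - beta2) * grad ** 2
    lr_t = lr * np.sqrt(1. - beta2 ** t) / (1. - beta1 ** t)
    param = param + lr_t * m / np.sqrt(v + eps)   # '+' because the ELBO is maximized
    return param, m, v

# usage sketch: maximize -(x - 3)^2, whose gradient is -2 * (x - 3)
x, m, v = 0.0, 0.0, 0.0
for t in range(1, 2001):
    x, m, v = adam_ascent_step(x, -2. * (x - 3.), m, v, t, lr=0.01)
# x ends up close to 3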
def main():
    stats_start_time = time.time()

    # setup logger
    logger = utils.setup_logging()
    logger.info("logger set up")

    # accept input
    search_string = raw_input("Enter search keyword ")
    try:
        crawl_limit = int(raw_input("Enter maximum number of pages to crawl "))
    except ValueError:
        logger.error("number of pages is not an integer")
        return
    if crawl_limit < 11:
        logger.error("no crawling required")
        return
    logger.info("starting search for %s by crawling %d pages", search_string, crawl_limit)

    # fetch initial pages
    while True:
        logger.info("fetching initial seed links for :: %s", search_string)
        initial_urls = utils.fetch_seed(search_string)
        logger.info("%d initial seed links fetched", len(initial_urls))
        if len(initial_urls) > 0:
            break

    # setup initial data
    # page_heap --> used to store type page which contains url, promise, depth
    # page_heap --> ordered by promise, largest promise on top
    page_heap = []
    # relevance is used to store relevance of crawled urls
    # url --> relevance
    relevance = {}
    # mapping to store incoming links from other urls
    # url -> [url1, url2...url_n]
    # this is mapped as an inverted graph.
    # eg: url1 has incoming links from [url2, url3]
    links = {}
    # pages_crawled, stats_errors, relevant_count are used to track crawler stats
    pages_crawled = 0
    stats_errors = 0
    relevant_count = 0
    black_list = ["php", "pdf", "jpg", "png", "mailto", "comment", "advertising",
                  "javascript", "cite", "cite_note", "picture", "image", "photo",
                  "#", ".mp3", ".mp4"]

    # output file
    output_file = open("crawler.txt", "w")

    # push initial seed urls to heap
    for url in initial_urls:
        if FOCUSSED_CRAWL:
            heapq.heappush(page_heap, page.Page(url, 10, 0))
        else:
            page_heap.append(page.Page(url, 10, 0))
        links[url] = ["www.google.com"]

    # setup loop to crawl the web
    # Flow:
    # 1. Pop page off the heap
    # 2. Fetch page
    # 3. Compute & store relevance
    # 4. If page was too deep, don't dig page for links
    # 5. Find all links in the page
    # 6. For all links
    #    1. If we are seeing the url for the first time, add to heap
    #    2. If we have seen the url before, update promise in heap
    # 7. Repeat
    while pages_crawled < crawl_limit and len(page_heap) > 0:
        if FOCUSSED_CRAWL:
            next_page_to_crawl = heapq.heappop(page_heap)
        else:
            next_page_to_crawl = page_heap.pop(0)
        next_page_url = next_page_to_crawl.url
        try:
            if not utils.can_crawl(next_page_url):
                logger.info("not allowed to crawl %s", next_page_url)
                del links[next_page_url]
                continue
        except IOError:
            logger.error("error connecting to %s", next_page_url)
            continue
        try:
            logger.info("trying to fetch page :: %s", next_page_url)
            next_page = requests.get(next_page_url, timeout=1)
        except requests.exceptions.RequestException:
            logger.error("exception fetching page :: %s", next_page_url)
            stats_errors = stats_errors + 1
            continue
        if next_page.status_code != 200:
            logger.error("error fetching page :: %s", next_page.status_code)
            stats_errors = stats_errors + 1
            continue
        pages_crawled = pages_crawled + 1
        page_relevance = utils.compute_relevance(next_page.text, search_string)
        # scale cosine threshold to 0-100
        if page_relevance > COSINE_RELEVANCE_THRESHOLD * 100:
            relevant_count = relevant_count + 1
        # write output to file
        output = str(pages_crawled) + " " + next_page_url + "\n"
        output_string = " time: " + str(datetime.datetime.time(datetime.datetime.now())) +\
            " size:" + str(len(next_page.content)) + " relevance:" + str(page_relevance)
        if FOCUSSED_CRAWL:
            output_string = output_string + " promise:" + str(next_page_to_crawl.promise) + "\n\n"
        else:
            output_string = output_string + "\n\n"
        output_file.write(output)
        output_file.write(output_string)
        output_file.flush()
        relevance[next_page_url] = page_relevance
        old_domain = urlparse(next_page_url).netloc
        links_on_page = utils.get_links_on_page(next_page_url, next_page.text)
        for url in links_on_page:
            # check if url has already been visited
            if url in relevance:
                logger.info("ignoring already visited url :: %s", url)
                continue
            # check if url is blacklisted
            if utils.is_blacklisted_url(black_list, url):
                logger.info("ignoring blacklisted url :: %s", url)
                continue
            # check if page is soon to be visited (present in page_heap)
            if page.Page(url, 0, 0) in page_heap:
                # update url promise if we are in focussed mode only
                # no need to update promise in bfs
                if FOCUSSED_CRAWL:
                    logger.info("new pointer to %s , updating promise", url)
                    utils.update_url_promise(url, next_page_url, relevance, links,
                                             page_heap, crawl_limit)
                continue
            # At this point, we know we are seeing the page for the first time
            # add page to heap, create first link for page
            logger.info("new link %s found, adding to page_heap", url)
            # check if we are crawling too deep into a domain
            new_domain = urlparse(url).netloc
            depth = 0
            if new_domain == old_domain:
                depth = next_page_to_crawl.depth + 1
            if depth >= MAX_DEPTH_TO_CRAWL:
                continue
            # compute predicted promise
            predicted_promise = utils.compute_promise(next_page_url, url, relevance, search_string)
            new_page = page.Page(url, predicted_promise, depth)
            if FOCUSSED_CRAWL:
                heapq.heappush(page_heap, new_page)
            else:
                page_heap.append(new_page)
            links[url] = [next_page_url]
            # an optimization to ensure heapify operation stays O(log(crawl_limit))
            if len(page_heap) > crawl_limit:
                logger.info("trimming heap")
                del page_heap[int(math.ceil(crawl_limit * 0.8)):]
        # delete incoming links to a page for 'search in links' optimization
        # we will not be using this data again as we don't visit seen urls again
        try:
            del links[next_page_url]
        except Exception:
            logger.error("error removing graph links to :: %s", next_page_url)

    # log stats to file
    output_file.write("\n~~~~~~~~~~~~~~~~~~~Stats~~~~~~~~~~~~~~~~\n\n")
    harvest_percentage = str(100 * float(relevant_count) / float(crawl_limit))
    output_file.write("harvest rate : " + harvest_percentage + " percent\n")
    output_file.write("4xx errors : " + str(stats_errors) + "\n")
    output_file.write("execution time : " + str((time.time() - stats_start_time) / 60) + " minutes\n")
    output_file.write("\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~")
    output_file.flush()
    output_file.close()
def main(cfg): device = torch.device('cuda' if cfg.cuda else 'cpu') autoenc = DeepLabv3Plus() model = Siamese(autoenc, in_channels=3, n_edges=cfg.n_edges, sp_pool_use_max=cfg.sp_pooling_max) if (cfg.checkpoint_autoenc is not None): print('loading checkpoint {}'.format(cfg.checkpoint_autoenc)) state_dict = torch.load(cfg.checkpoint_autoenc, map_location=lambda storage, loc: storage) autoenc.load_state_dict(state_dict) elif (cfg.checkpoint_siam is not None): print('loading checkpoint {}'.format(cfg.checkpoint_siam)) state_dict = torch.load(cfg.checkpoint_siam, map_location=lambda storage, loc: storage) model.load_state_dict(state_dict) autoenc.to(device) model.to(device) transf = iaa.Sequential([ iaa.Invert(0.5) if 'Dataset1' in 'Dataset' + cfg.train_dir else iaa.Noop(), iaa.SomeOf(3, [ iaa.Affine(scale={ "x": (1 - cfg.aug_scale, 1 + cfg.aug_scale), "y": (1 - cfg.aug_scale, 1 + cfg.aug_scale) }, rotate=(-cfg.aug_rotate, cfg.aug_rotate), shear=(-cfg.aug_shear, cfg.aug_shear)), iaa.SomeOf(1, [ iaa.AdditiveGaussianNoise(scale=cfg.aug_noise * 255), iaa.GaussianBlur(sigma=(0., cfg.aug_blur)), iaa.GammaContrast((0., cfg.aug_gamma)) ]), iaa.Fliplr(p=0.5), iaa.Flipud(p=0.5) ]), rescale_augmenter ]) transf_normal = Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) dl_train = Loader(pjoin(cfg.in_root, 'Dataset' + cfg.train_dir), augmentation=transf, n_segments=cfg.n_segments_train, delta_segments=cfg.delta_segments_train, normalization=transf_normal) dl_test = torch.utils.data.ConcatDataset([ Loader(pjoin(cfg.in_root, 'Dataset' + d), augmentation=transf, n_segments=cfg.n_segments_test, delta_segments=cfg.delta_segments_test, normalization=transf_normal) for d in cfg.test_dirs ]) dataloader_train = DataLoader(dl_train, batch_size=cfg.batch_size, sampler=SubsetRandomSampler( cfg.n_frames_epoch * cfg.train_frames), collate_fn=dl_train.collate_fn, drop_last=True, num_workers=cfg.n_workers) dataloader_test = DataLoader(dl_test, batch_size=cfg.batch_size, collate_fn=dl_train.collate_fn, sampler=torch.utils.data.RandomSampler( dl_test, replacement=True, num_samples=cfg.batch_size), num_workers=cfg.n_workers) dataloaders = {'train': dataloader_train, 'test': dataloader_test} d = datetime.datetime.now() ds_dir = os.path.split('Dataset' + cfg.train_dir)[-1] run_dir = pjoin(cfg.out_dir, '{}_{:%Y-%m-%d_%H-%M}_{}'.format(ds_dir, d, cfg.exp_name)) if (not os.path.exists(run_dir)): os.makedirs(run_dir) # Save cfg with open(pjoin(run_dir, 'cfg.yml'), 'w') as outfile: yaml.dump(cfg.__dict__, stream=outfile, default_flow_style=False) # convert batch to device batch_to_device = lambda batch: { k: v.to(device) if (isinstance(v, torch.Tensor)) else v for k, v in batch.items() } optimizer = optim.SGD(params=[{ 'params': model.autoenc.encoder.parameters(), 'lr': cfg.lr_autoenc }, { 'params': model.autoenc.aspp.parameters(), 'lr': cfg.lr_autoenc }, { 'params': model.autoenc.decoder.parameters(), 'lr': cfg.lr_siam }, { 'params': model.linear1.parameters(), 'lr': cfg.lr_siam }, { 'params': model.linear2.parameters(), 'lr': cfg.lr_siam }], momentum=cfg.momentum, weight_decay=cfg.decay) utls.setup_logging(run_dir) logger = logging.getLogger('siam') logger.info('run_dir: {}'.format(run_dir)) train(cfg, model, dataloaders, run_dir, batch_to_device, optimizer, logger) logger.info('training siam')
import utils log = utils.setup_logging(__name__) class Watson: def __init__(self, wml_client, wos_client): self.wml_client = wml_client self.wos_client = wos_client def get_service_provider_by_name(self, service_provider_name): service_providers = self.wos_client.service_providers.list( ).result.service_providers log.debug("Service providers size: " + str(len(service_providers))) service_provider_id = None for service_provider in service_providers: if service_provider.entity.name == service_provider_name: service_provider_id = service_provider.metadata.id log.debug("Found the service_provider: {}".format( service_provider_id)) return service_provider_id
def run_experiment(args): import os # set environment variables for theano os.environ['THEANO_FLAGS'] = "lib.cnmem=" + str(args.mem) + ",device=gpu" + str(args.gpu) import threading import Queue import inspect import shutil import time import logging import six import collections import itertools import random import numpy as np import scipy import theano import theano.tensor as T import lasagne import lasagne.layers as ll import lasagne.nonlinearities as ln import parmesan import layers import utils import cfdataset #---------------------------------------------------------------- # Arguments and Settings floatX = theano.config.floatX logger = logging.getLogger() np.random.seed(args.seed) # copy file for reproducibility dirname = utils.setup_logging(args.message, args.loglv) script_src = os.path.abspath(inspect.getfile(inspect.currentframe())) script_dst = os.path.join(dirname, os.path.split(script_src)[1]) shutil.copyfile(script_src, script_dst) # print arguments args_dict = collections.OrderedDict(sorted(vars(args).items())) for k, v in six.iteritems(args_dict): logger.info(" %20s: %s" % (k, v)) # get arguments D_u, D_v = args.D_u, args.D_v lr = args.lr weight_decay = args.weight_decay lookahead = args.lookahead max_epoch = args.max_epoch batch_size_u, batch_size_v = args.batch_size_u, args.batch_size_v nonlin_enc = layers.get_nonlin(args.nonlin_enc) nonlin_dec = layers.get_nonlin(args.nonlin_dec) negative_ratio = args.negative_ratio #---------------------------------------------------------------- # Dataset dataset = cfdataset.CF_implicit_data(name=args.dataset) N_u, N_v = dataset.N_users, dataset.N_items T_matrix = dataset.T_matrix.astype(floatX) R_matrix = dataset.R_matrix.astype(floatX) R_negative_matrix = 1 - R_matrix assert np.all(R_matrix == (T_matrix > 0.5)) assert np.all((R_negative_matrix == 1) == (T_matrix == 0)) R_test = dataset.R_latest T_matrix[np.arange(N_u), R_test] = 0 R_matrix[np.arange(N_u), R_test] = 0 assert np.all(R_matrix == (T_matrix > 0.5)) R_matrix_for_test = R_matrix.copy() R_valid = dataset.R_2nd_latest T_matrix[np.arange(N_u), R_valid] = 0 R_matrix[np.arange(N_u), R_valid] = 0 assert np.all(R_matrix == (T_matrix > 0.5)) N_interaction = dataset.N_interaction - N_u * 2 assert np.all(R_valid != R_test) assert np.all(R_matrix_for_test[np.arange(N_u), R_valid] == 1) assert np.all(R_matrix_for_test[np.arange(N_u), R_test] == 0) assert np.all(R_matrix[np.arange(N_u), R_valid] == 0) assert np.all(R_matrix[np.arange(N_u), R_test] == 0) assert np.all(T_matrix[np.arange(N_u), R_valid] == 0) assert np.all(T_matrix[np.arange(N_u), R_test] == 0) assert N_interaction == np.count_nonzero(R_matrix) assert N_interaction + N_u == np.count_nonzero(R_matrix_for_test) logger.info("%d users, %d items, %d training interactions (%d total, 2 * %d held out for validation and test)." 
% (N_u, N_v, N_interaction, dataset.N_interaction, N_u)) #---------------------------------------------------------------- # numpy variables # encoded vectors np_enc_u_h = np.zeros((N_u, D_u), dtype=floatX) np_enc_v_h = np.zeros((N_v, D_v), dtype=floatX) #---------------------------------------------------------------- # Symbolic variables sym_lr = T.fscalar('lr') sym_Ru_pos = T.fmatrix('Ru_pos') sym_dr_Ru_pos = T.fscalar('dr_Ru_pos') sym_uid_origin_pos = T.ivector('uid_origin_pos') sym_uid_minibatch_pos = T.ivector('uid_minibatch_pos') sym_Ru_neg = T.fmatrix('Ru_neg') sym_dr_Ru_neg = T.fscalar('dr_Ru_neg') sym_uid_origin_neg = T.ivector('uid_origin_neg') sym_uid_minibatch_neg = T.ivector('uid_minibatch_neg') sym_Rv = T.fmatrix('Rv') sym_dr_Rv = T.fscalar('dr_Rv') sym_vid_origin_pos = T.ivector('vid_origin_pos') sym_vid_minibatch_pos = T.ivector('vid_minibatch_pos') sym_vid_origin_neg = T.ivector('vid_origin_neg') sym_vid_minibatch_neg = T.ivector('vid_minibatch_neg') sym_R_minibatch = T.fvector('R_minibatch') #---------------------------------------------------------------- # Model setup (training model) logger.info("Setting up model ...") # Input layers l_in_Ru_pos = ll.InputLayer((None, N_v), input_var=sym_Ru_pos, name='l_in_Ru_pos') l_in_uid_origin_pos = ll.InputLayer((None,), input_var=sym_uid_origin_pos, name='l_in_uid_origin_pos') l_in_uid_minibatch_pos = ll.InputLayer((None,), input_var=sym_uid_minibatch_pos, name='l_in_uid_minibatch_pos') l_in_Ru_neg = ll.InputLayer((None, N_v), input_var=sym_Ru_neg, name='l_in_Ru_neg') l_in_uid_origin_neg = ll.InputLayer((None,), input_var=sym_uid_origin_neg, name='l_in_uid_origin_neg') l_in_uid_minibatch_neg = ll.InputLayer((None,), input_var=sym_uid_minibatch_neg, name='l_in_uid_minibatch_neg') l_in_Rv = ll.InputLayer((None, N_u), input_var=sym_Rv, name='l_in_Rv') l_in_vid_origin_pos = ll.InputLayer((None,), input_var=sym_vid_origin_pos, name='l_in_vid_origin_pos') l_in_vid_minibatch_pos = ll.InputLayer((None,), input_var=sym_vid_minibatch_pos, name='l_in_vid_minibatch_pos') l_in_vid_origin_neg = ll.InputLayer((None,), input_var=sym_vid_origin_neg, name='l_in_vid_origin_neg') l_in_vid_minibatch_neg = ll.InputLayer((None,), input_var=sym_vid_minibatch_neg, name='l_in_vid_minibatch_neg') # Dropout layers l_in_Ru_pos = ll.DropoutLayer(l_in_Ru_pos, p=sym_dr_Ru_pos, rescale=False, name='Dropout-l_in_Ru_pos') l_in_Ru_neg = ll.DropoutLayer(l_in_Ru_neg, p=sym_dr_Ru_neg, rescale=False, name='Dropout-l_in_Ru_neg') l_in_Rv = ll.DropoutLayer(l_in_Rv, p=sym_dr_Rv, rescale=False, name='Dropout-l_in_Rv') # User encoder model h(Ru) l_enc_u_h_pos = ll.DenseLayer(l_in_Ru_pos, num_units=D_u, nonlinearity=nonlin_enc, name='l_enc_u_h_pos') l_enc_u_h_neg = ll.DenseLayer(l_in_Ru_neg, num_units=D_u, nonlinearity=nonlin_enc, W=l_enc_u_h_pos.W, b=l_enc_u_h_pos.b, name='l_enc_u_h_neg') # Item encoder model h(Rv) l_enc_v_h = ll.DenseLayer(l_in_Rv, num_units=D_v, nonlinearity=nonlin_enc, name='l_enc_v_h') # User decoder model s(h(Ru)) l_dec_u_s_pos = layers.SimpleDecodeLayer([l_enc_u_h_pos, l_in_vid_origin_pos, l_in_uid_minibatch_pos], num_units=N_v, nonlinearity=None, name='l_dec_u_s_pos') l_dec_u_s_neg = layers.SimpleDecodeLayer([l_enc_u_h_neg, l_in_vid_origin_neg, l_in_uid_minibatch_neg], num_units=N_v, V=l_dec_u_s_pos.V, Q=l_dec_u_s_pos.Q, b=l_dec_u_s_pos.b, nonlinearity=None, name='l_dec_u_s_neg') l_dec_u_s_all = ll.ConcatLayer([l_dec_u_s_pos ,l_dec_u_s_neg], axis=0) # Item decoder model s(h(Rv)) l_dec_v_s_pos = layers.SimpleDecodeLayer([l_enc_v_h, 
l_in_uid_origin_pos, l_in_vid_minibatch_pos], num_units=N_u, nonlinearity=None, name='l_dec_v_s_pos') l_dec_v_s_neg = layers.SimpleDecodeLayer([l_enc_v_h, l_in_uid_origin_neg, l_in_vid_minibatch_neg], num_units=N_u, V=l_dec_v_s_pos.V, Q=l_dec_v_s_pos.Q, b=l_dec_v_s_pos.b, nonlinearity=None, name='l_dec_v_s_neg') l_dec_v_s_all = ll.ConcatLayer([l_dec_v_s_pos ,l_dec_v_s_neg], axis=0) # Likelihood model p(R) l_uv_s_train = ll.ElemwiseSumLayer([l_dec_u_s_all, l_dec_v_s_all], name='l_uv_s_train') l_r_train = ll.NonlinearityLayer(l_uv_s_train, nonlinearity=ln.sigmoid, name='l_r_train') l_uv_s_test = ll.ElemwiseSumLayer([l_dec_u_s_pos, l_dec_v_s_pos], name='l_uv_s_test') l_r_test = ll.NonlinearityLayer(l_uv_s_test, nonlinearity=ln.sigmoid, name='l_r_test') #---------------------------------------------------------------- # Likelihood and RMSE # training p_r_train, = ll.get_output([l_r_train], deterministic=False) log_p_r = T.mean(parmesan.distributions.log_bernoulli(sym_R_minibatch, p_r_train, eps=1e-6)) regularization = lasagne.regularization.regularize_network_params([l_r_train], lasagne.regularization.l2) cost_function = - log_p_r + weight_decay * regularization SE_train = T.sum(T.sqr(sym_R_minibatch - p_r_train)) # test sym_enc_u_h = T.fmatrix('enc_u_h') sym_enc_v_h = T.fmatrix('enc_v_h') enc_u_h_out, enc_v_h_out = ll.get_output([l_enc_u_h_pos, l_enc_v_h], deterministic=True) p_r_test, = ll.get_output([l_r_test], inputs={l_enc_u_h_pos:sym_enc_u_h, l_enc_v_h:sym_enc_v_h}, deterministic=True) test_scores = p_r_test.reshape((-1, 101)) ranking = test_scores.argsort()[:,::-1].argmin(axis=1) #---------------------------------------------------------------- # Gradients clip_grad = 1 max_norm = 5 params = ll.get_all_params([l_r_train,], trainable=True) for p in params: logger.debug("%s: %s" % (p, p.get_value().shape)) grads = T.grad(cost_function, params) mgrads = lasagne.updates.total_norm_constraint(grads, max_norm=max_norm) cgrads = [T.clip(g, -clip_grad, clip_grad) for g in mgrads] #updates = lasagne.updates.adam(cgrads, params, beta1=0.9, beta2=0.999, epsilon=1e-4, learning_rate=sym_lr) updates, sym_vars_list = utils.adam(cgrads, params, beta1=0.9, beta2=0.999, epsilon=1e-4, learning_rate=sym_lr) # moving average params_avg=[] for param in params: value = param.get_value(borrow=True) params_avg.append(theano.shared(np.zeros(value.shape, dtype=value.dtype), broadcastable=param.broadcastable, name=param.name + '_avg')) avg_updates = [(a, a + 0.01 * (p - a)) for p, a in zip(params, params_avg)] avg_givens = [(p, a) for p, a in zip(params, params_avg)] all_updates = updates.items() + avg_updates #---------------------------------------------------------------- # Compile # training function logger.info("Compiling train_model ...") train_model = theano.function( inputs=[sym_lr, sym_uid_origin_pos, sym_uid_minibatch_pos, sym_vid_origin_pos, sym_vid_minibatch_pos, sym_uid_origin_neg, sym_uid_minibatch_neg, sym_vid_origin_neg, sym_vid_minibatch_neg, sym_Ru_pos, sym_Ru_neg, sym_Rv, sym_R_minibatch, sym_dr_Ru_pos, sym_dr_Ru_neg, sym_dr_Rv], outputs=[log_p_r, SE_train], updates=all_updates, ) # encoders logger.info("Compiling encode_model ...") u_encode_model = theano.function(inputs=[sym_Ru_pos], outputs=enc_u_h_out) v_encode_model = theano.function(inputs=[sym_Rv], outputs=enc_v_h_out) u_encode_avg_model = theano.function(inputs=[sym_Ru_pos], outputs=enc_u_h_out, givens=avg_givens, on_unused_input='ignore') v_encode_avg_model = theano.function(inputs=[sym_Rv], outputs=enc_v_h_out, givens=avg_givens, 
on_unused_input='ignore') # test function logger.info("Compiling test_model ...") test_model = theano.function( inputs=[sym_uid_origin_pos, sym_uid_minibatch_pos, sym_vid_origin_pos, sym_vid_minibatch_pos, sym_enc_u_h, sym_enc_v_h], outputs=[ranking], ) test_avg_model = theano.function( inputs=[sym_uid_origin_pos, sym_uid_minibatch_pos, sym_vid_origin_pos, sym_vid_minibatch_pos, sym_enc_u_h, sym_enc_v_h], outputs=[ranking], givens=avg_givens, on_unused_input='ignore', ) #---------------------------------------------------------------- # Predict function def compute_hidden_for(for_which_set='test', avg_model=False): assert for_which_set in ['valid', 'test'] if for_which_set == 'valid': R_matrix_cond = R_matrix else: R_matrix_cond = R_matrix_for_test # preconpute hidden representation u_end = 0 while u_end < N_u: u_start, u_end = u_end, min(u_end + batch_size_u, N_u) # create user mini-batch u_batch_ids = np.arange(u_start, u_end).astype('int32') # create conditionals Ru_minibatch = R_matrix_cond[u_batch_ids,:] # encode if avg_model: np_enc_u_h[u_batch_ids] = u_encode_avg_model(Ru_minibatch) else: np_enc_u_h[u_batch_ids] = u_encode_model(Ru_minibatch) v_end = 0 while v_end < N_v: v_start, v_end = v_end, min(v_end + batch_size_v, N_v) # create item mini-batch v_batch_ids = np.arange(v_start, v_end).astype('int32') # create conditionals Rv_minibatch = R_matrix_cond[:,v_batch_ids].T # encode if avg_model: np_enc_v_h[v_batch_ids] = v_encode_avg_model(Rv_minibatch) else: np_enc_v_h[v_batch_ids] = v_encode_model(Rv_minibatch) def predict_once(which_set='test', avg_model=False): assert which_set in ['valid', 'test'] if which_set == 'valid': R_predict = R_valid else: R_predict = R_test # test statistics rankings = [] # loop users u_end = 0 while u_end < N_u: u_start, u_end = u_end, min(u_end + batch_size_u, N_u) # create user mini-batch and item mini-batch u_batch_ids = np.arange(u_start, u_end).astype('int32') vid_negative = np.asarray([np.random.choice(np.where(row)[0], 100, replace=False) for row in R_negative_matrix[u_batch_ids]], dtype='int32') vid = np.concatenate([R_predict[u_batch_ids].reshape(-1,1), vid_negative], axis=1).flatten() uid_origin = np.repeat(u_batch_ids, 101) uid_minibatch = uid_origin - u_start # get encoded vectors Ru_encoded = np_enc_u_h[u_batch_ids] if avg_model: rankings_minibatch, = test_avg_model(uid_origin, uid_minibatch, vid, vid, Ru_encoded, np_enc_v_h) else: rankings_minibatch, = test_model(uid_origin, uid_minibatch, vid, vid, Ru_encoded, np_enc_v_h) rankings.append(rankings_minibatch) rankings = np.concatenate(rankings) HR = np.mean(rankings < 10) NDCG = np.mean((rankings < 10) / np.log2(rankings + 2)) return HR, NDCG def predict(which_set='test', avg=10, avg_model=False): compute_hidden_for(for_which_set=which_set, avg_model=avg_model) HR_list = [] NDCG_list = [] for i in range(avg): hr, ndcg = predict_once(which_set=which_set, avg_model=avg_model) HR_list.append(hr) NDCG_list.append(ndcg) HR_mean = np.mean(HR_list) NDCG_mean = np.mean(NDCG_list) HR_std = np.std(HR_list) NDCG_std = np.std(NDCG_list) # print info after test finished eval_msg = which_set if not avg_model else which_set + ' (avg model)' logger.critical("%-20s HR = %.3f +- %.3f, NDCG = %.3f +- %.3f." 
% (eval_msg, HR_mean, HR_std, NDCG_mean, NDCG_std)) return HR_mean, NDCG_mean #---------------------------------------------------------------- # Training best_valid_result = - np.inf best_model = None best_auxiliary = None n_epocs_without_improvement = 0 minibatch_queue = Queue.Queue(maxsize=10) # function for preparing minibatches def prepare_minibatch(minibatch_list): # loop mini-batches for u_batch_ids, v_batch_ids in minibatch_list: Rv_minibatch = R_matrix[:,v_batch_ids].T Rv_minibatch[:,u_batch_ids] = 0 Ru_minibatch_neg = R_matrix[u_batch_ids,:] #Ru_minibatch_neg[:,v_batch_ids] = 0 # create training samples mini-batch T_matrix_minibatch = T_matrix[np.ix_(u_batch_ids, v_batch_ids)] T_matrix_minibatch_sparse = scipy.sparse.coo_matrix(T_matrix_minibatch) n_interactions_minibatch = T_matrix_minibatch_sparse.count_nonzero() Ru_minibatch_pos = ((T_matrix[u_batch_ids[T_matrix_minibatch_sparse.row]] < T_matrix_minibatch_sparse.data.reshape(n_interactions_minibatch, 1)) & (T_matrix[u_batch_ids[T_matrix_minibatch_sparse.row]] > 0)).astype(floatX) uid_minibatch_pos = np.arange(n_interactions_minibatch).astype('int32') uid_origin_pos = u_batch_ids[T_matrix_minibatch_sparse.row] vid_minibatch_pos = T_matrix_minibatch_sparse.col vid_origin_pos = v_batch_ids[vid_minibatch_pos] R_matrix_negative_minibatch = 1 - R_matrix[np.ix_(u_batch_ids, v_batch_ids)] R_matrix_negative_minibatch_sparse = scipy.sparse.coo_matrix(R_matrix_negative_minibatch) n_negative_total = R_matrix_negative_minibatch_sparse.count_nonzero() assert n_negative_total + n_interactions_minibatch == u_batch_ids.size * v_batch_ids.size choice_negative = np.random.choice(n_negative_total, min(n_negative_total, np.int(n_interactions_minibatch * negative_ratio)), replace=False) uid_minibatch_neg = R_matrix_negative_minibatch_sparse.row[choice_negative] uid_origin_neg = u_batch_ids[uid_minibatch_neg] vid_minibatch_neg = R_matrix_negative_minibatch_sparse.col[choice_negative] vid_origin_neg = v_batch_ids[vid_minibatch_neg] R_minibatch = np.concatenate([np.ones_like(T_matrix_minibatch_sparse.data), R_matrix_negative_minibatch_sparse.data[choice_negative] * 0]) n_pred_step = R_minibatch.shape[0] if n_pred_step == 0: raise ValueError('No interactions in this minibatch.') dr_Ru_pos = min(max(1 - 2 * np.random.rand(), 0), 0.8) dr_Ru_neg = 0.2 dr_Rv = min(max(1 - 2 * np.random.rand(), 0), 0.8) # package everything into a tuple data_minibatch_package = ( uid_origin_pos, uid_minibatch_pos, vid_origin_pos, vid_minibatch_pos, uid_origin_neg, uid_minibatch_neg, vid_origin_neg, vid_minibatch_neg, Ru_minibatch_pos, Ru_minibatch_neg, Rv_minibatch, R_minibatch, dr_Ru_pos, dr_Ru_neg, dr_Rv) # enqueue minibatch_queue.put((n_pred_step, data_minibatch_package)) logger.warning("Training started.") # loop epoch for epoch in range(1, 1+max_epoch): epoch_start_time = time.time() # training statistics LL_epoch, SE_epoch= 0, 0 n_pred_epoch = 0 u_order = np.array_split(np.random.permutation(N_u).astype('int32'), N_u // batch_size_u + 1) v_order = np.array_split(np.random.permutation(N_v).astype('int32'), N_v // batch_size_v + 1) minibatch_order = list(itertools.product(u_order, v_order)) random.shuffle(minibatch_order) n_threads = 5 n_minibatch_thread = len(minibatch_order) // n_threads + 1 for t in range(n_threads): thr = threading.Thread(target=prepare_minibatch, args=(minibatch_order[t*n_minibatch_thread:(t+1)*n_minibatch_thread],)) thr.setDaemon(True) thr.start() for step in range(len(minibatch_order)): n_pred_step, data_minibatch_package = minibatch_queue.get() 
# update parameters and calculate likelihood and RMSE LL_step, SE_step = train_model(lr, *data_minibatch_package) minibatch_queue.task_done() LL_epoch += LL_step * n_pred_step SE_epoch += SE_step n_pred_epoch += n_pred_step assert minibatch_queue.qsize() == 0 # print info after epoch finished LL_epoch /= n_pred_epoch RMSE_epoch = np.sqrt(SE_epoch/n_pred_epoch) epoch_end_time = time.time() logger.info("Epoch %d, training RMSE = %f, LL = %f (%d training ratings). Elapsed time %.1fs." % (epoch, RMSE_epoch, LL_epoch, n_pred_epoch, epoch_end_time-epoch_start_time)) # validation HR_valid, NDCG_valid = predict('valid') HR_test, NDCG_test = predict('test') HR_test, NDCG_test = predict('test', avg_model=True) # termination #if NDCG_valid > best_valid_result: if HR_valid > best_valid_result: n_epocs_without_improvement = 0 #best_valid_result = NDCG_valid best_valid_result = HR_valid best_model = ll.get_all_param_values([l_r_train,], trainable=True) best_auxiliary = utils.get_all_shvar_values(sym_vars_list) logger.debug("New best model found!") else: n_epocs_without_improvement += 1 if n_epocs_without_improvement >= lookahead: ll.set_all_param_values([l_r_train,], best_model, trainable=True) utils.set_all_shvar_values(sym_vars_list, best_auxiliary) if lr > 1e-5: n_epocs_without_improvement = 0 lr /= 4 logger.error("Learning rate = %f now." % lr) else: logger.error("Training finished.") break #---------------------------------------------------------------- # Test HR_test, NDCG_test = predict('test') HR_test, NDCG_test = predict('test', avg_model=True) #---------------------------------------------------------------- # Summarization for k, v in six.iteritems(args_dict): logger.info(" %20s: %s" % (k, v))
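# ----------------------------------------------------------------
# predict_once above ranks each user's held-out item against 100 sampled
# negatives and reports HR@10 and NDCG@10. A small self-contained NumPy
# example of those two formulas, with made-up 0-based rank positions of the
# positive item among the 101 candidates:
import numpy as np

rankings = np.array([0, 3, 12, 57, 9])
HR = np.mean(rankings < 10)                              # hit if the positive lands in the top 10
NDCG = np.mean((rankings < 10) / np.log2(rankings + 2))  # DCG of a single relevant item, IDCG = 1
print(HR, NDCG)                                          # 0.6 and roughly 0.34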
import sys, os import xml.dom.minidom import subprocess import signal, utils import splunk.entity as en from service import Protocol logger = utils.setup_logging("rpcstart") SCHEME = """<scheme> <title>Splunk RPC Startup</title> <description>Start up RPC service server.</description> <use_external_validation>true</use_external_validation> <streaming_mode>xml</streaming_mode> <endpoint> <args> <arg name="name"> <title>Resource name</title> <description> Java RPC server name </description> </arg> <arg name="javapath"> <title>Java Installation</title> </arg> <arg name="options"> <title>Java Options</title> </arg>
def main( mistakes_path: Path, outdir: Path, plan_iters: int = 10, optim: Literal["sgd", "adam"] = "sgd", lr: float = 0.1, momentum: bool = False, nesterov: bool = False, extra_inits: bool = False, replications: Optional[str] = None, log_time: bool = False, log_best_inits: bool = False, n_traj_max: Optional[int] = None, verbosity: Literal["INFO", "DEBUG"] = "INFO", ): outdir = Path(outdir) experiment_dir = outdir / make_experiment( optim, lr, plan_iters, momentum, nesterov, extra_inits ) experiment_dir.mkdir(parents=True, exist_ok=True) setup_logging(verbosity=verbosity, log_path=experiment_dir / "log.txt") if replications is not None: replication_indices = parse_replications(replications) mistakes_paths = [ Path(mistakes_path) / str(index) / "planner_mistakes.pkl" for index in replication_indices ] else: mistakes_paths = [Path(mistakes_path)] if optim == "sgd": optimizer = SGD(learning_rate=lr, momentum=momentum, nesterov=nesterov) elif optim == "adam": optimizer = Adam(learning_rate=lr) env = LegacyEnv(reward=np.zeros(4)) starts, rewards, better_trajs = collect_mistakes( mistakes_paths=mistakes_paths, n_max=n_traj_max ) init_controls = ( np.array( [ [[0.0, 1.0]] * 50, [[0.0, -1.0]] * 50, [[-0.5, -1.0]] * 50, [[0.5, -1.0]] * 50, [[0.5, 1.0]] * 50, [[-0.5, 1.0]] * 50, ] ) if extra_inits else None ) logging.info("Making trajectories") opt_trajs, losses = make_opt_trajs( traj_opt=TrajOptimizer( n_planner_iters=plan_iters, optim=optimizer, init_controls=init_controls, log_best_init=log_best_inits, ), rewards=rewards, starts=starts, log_time=log_time, ) logging.info("Rolling out trajectories") returns = np.empty((len(starts), 2)) for i, (start, reward_weights, opt_traj, policy_traj, loss) in enumerate( zip(starts, rewards, opt_trajs, better_trajs, losses) ): env.reward = reward_weights traj_opt_return = rollout(actions=opt_traj, env=env, start=start) policy_return = rollout(actions=policy_traj, env=env, start=start) assert ( abs(traj_opt_return + loss) < 0.001 ), f"Rollout={traj_opt_return} and loss={loss}, differ by too much. start={start}, reward={reward_weights}" returns[i, 0] = traj_opt_return returns[i, 1] = policy_return logging.debug( f"Traj opt return={traj_opt_return}, loss={loss}, policy_return={policy_return}, delta={traj_opt_return-policy_return}" ) np.save(experiment_dir / "returns.npy", returns) deltas = returns[:, 0] - returns[:, 1] logging.info( f"Mean delta={np.mean(deltas)}, mean better={np.mean(deltas > 0)*100:.1f}%, optim={optim}, lr={lr}, n={plan_iters}, momentum={momentum}, nesterov={nesterov}, extra inits={extra_inits}" ) plot_returns(returns, experiment_dir)
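# ----------------------------------------------------------------
# The comparison above depends on a rollout(actions, env, start) helper that
# is not shown in this excerpt. A hypothetical sketch of such a helper,
# assuming a gym-style environment whose state can be set directly (the real
# LegacyEnv API may differ):
import numpy as np

def rollout(actions: np.ndarray, env, start) -> float:
    """Play an open-loop action sequence from a fixed start state and sum rewards."""
    env.reset()
    env.state = start  # assumption: the env exposes a settable state
    total_return = 0.0
    for action in actions:
        _obs, reward, _done, _info = env.step(action)
        total_return += float(reward)
    return total_return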
def main() -> None: # Parse start arguments parser = argparse.ArgumentParser( description="Moonraker - Klipper API Server") parser.add_argument("-c", "--configfile", default="~/moonraker.conf", metavar='<configfile>', help="Location of moonraker configuration file") parser.add_argument("-l", "--logfile", default="/tmp/moonraker.log", metavar='<logfile>', help="log file name and location") parser.add_argument("-n", "--nologfile", action='store_true', help="disable logging to a file") cmd_line_args = parser.parse_args() cfg_file = cmd_line_args.configfile app_args = {'config_file': cfg_file} # Setup Logging version = utils.get_software_version() if cmd_line_args.nologfile: app_args['log_file'] = "" else: app_args['log_file'] = os.path.normpath( os.path.expanduser(cmd_line_args.logfile)) app_args['software_version'] = version ql, file_logger, warning = utils.setup_logging(app_args) if warning is not None: app_args['log_warning'] = warning if sys.version_info < (3, 7): msg = f"Moonraker requires Python 3.7 or above. " \ f"Detected Version: {sys.version}" logging.info(msg) print(msg) ql.stop() exit(1) # Start asyncio event loop and server event_loop = EventLoop() alt_config_loaded = False estatus = 0 while True: try: server = Server(app_args, file_logger, event_loop) server.load_components() except confighelper.ConfigError as e: backup_cfg = confighelper.find_config_backup(cfg_file) if alt_config_loaded or backup_cfg is None: logging.exception("Server Config Error") estatus = 1 break app_args['config_file'] = backup_cfg app_args['config_warning'] = ( f"Server configuration error: {e}\n" f"Loaded server from most recent working configuration:" f" '{app_args['config_file']}'\n" f"Please fix the issue in moonraker.conf and restart " f"the server.") alt_config_loaded = True continue except Exception: logging.exception("Moonraker Error") estatus = 1 break try: event_loop.register_callback(server.server_init) event_loop.start() except Exception: logging.exception("Server Running Error") estatus = 1 break if server.exit_reason == "terminate": break # Restore the original config and clear the warning # before the server restarts if alt_config_loaded: app_args['config_file'] = cfg_file app_args.pop('config_warning', None) alt_config_loaded = False event_loop.close() # Since we are running outside of the the server # it is ok to use a blocking sleep here time.sleep(.5) logging.info("Attempting Server Restart...") for _ in range(5): # Sometimes the new loop does not properly instantiate. # Give 5 attempts before raising an exception new_loop = asyncio.new_event_loop() if not new_loop.is_closed(): break logging.info("Failed to create open eventloop, " "retyring in .5 seconds...") time.sleep(.5) else: raise RuntimeError("Unable to create new open eventloop") asyncio.set_event_loop(new_loop) event_loop.reset() event_loop.close() logging.info("Server Shutdown") ql.stop() exit(estatus)
    '--functions', help='name of the function file', required=True,
    default='/storage/users/cnalab/apkdata-tanya/binary/new.large.sframe')
parser.add_argument(
    '--net', help='name of a network file (extraction only for anchors)')
parser.add_argument('--output', help='output path', required=True)
args = parser.parse_args()

#test_file = '/storage/users/cnalab/apkdata-tanya/binary/test-tc-1000.npy'
#if test_file:
#    print(f"Reading test file: {test_file}")
#    test_apns = np.load(test_file)

path = setup_path(args=args)
setup_logging(path=path, parser=parser)

net_file = args.net
logging.info(f"Reading net file {net_file}")
gamma, net = load_net(net_file)
test_apns = list(net.keys())
logging.info(f"Extracted apn: {len(test_apns)}")

setup_turi()
tc.config.set_runtime_config('TURI_DEFAULT_NUM_PYLAMBDA_WORKERS', 16)

logging.info('Loading functions')
mw = load_functions_partition(directory='', name=args.functions)

logging.info('Filter started')
test_f = mw.filter_by(values=test_apns, column_name='apk')
parser.add_argument('--resume', '-r', action='store_true', help='resume from checkpoint') parser.add_argument('--results_dir', metavar='RESULTS_DIR', default='./results', help='results dir') parser.add_argument('--resume_dir', default=None, help='resume dir') args = parser.parse_args() args.save = datetime.now().strftime('%Y-%m-%d_%H-%M-%S') save_path = os.path.join(args.results_dir, args.save) if not os.path.exists(save_path): os.makedirs(save_path) setup_logging(os.path.join(save_path, 'log.txt')) logging.info("saving to %s", save_path) logging.info("run arguments: %s", args) use_cuda = torch.cuda.is_available() best_acc = 0 # best test accuracy start_epoch = 0 # start from epoch 0 or last checkpoint epoch # Data print('==> Preparing data..') transform_train = transforms.Compose([ transforms.RandomCrop(32, padding=4), transforms.RandomHorizontalFlip(), transforms.ToTensor(), transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)), ])
ap.add_argument('installdir') ap.add_argument('tarball') args = ap.parse_args() if not exists(args.installdir): print 'directory {} does not exist'.format(args.installdir) sys.exit(1) if os.listdir(args.installdir): print 'directory {} is not empty'.format(args.installdir) sys.exit(1) if not exists(args.tarball): print 'file {} does not exist'.format(args.tarball) sys.exit(1) m = re.match(r'^.*?_([\d\.]+).*?\.tar\.gz$', basename(args.tarball)) version = m.group(1) cfg = ServerConfig(installdir=args.installdir, tarball=args.tarball, version=version) setup_server(cfg, args.db) start_server(cfg) create_test_user(cfg) if __name__ == '__main__': setup_logging() main()
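# ----------------------------------------------------------------
# re.match returns None when the tarball name does not fit the expected
# <name>_<version>...tar.gz pattern, and m.group(1) would then raise an
# AttributeError. A defensive variant of that step, written in the same
# Python 2 style as the script above (helper name is illustrative):
import re
import sys
from os.path import basename

def parse_version(tarball_path):
    m = re.match(r'^.*?_([\d\.]+).*?\.tar\.gz$', basename(tarball_path))
    if m is None:
        print 'cannot parse version from {}'.format(basename(tarball_path))
        sys.exit(1)
    return m.group(1)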
def main(): # Parse start arguments parser = argparse.ArgumentParser( description="Moonraker - Klipper API Server") parser.add_argument("-c", "--configfile", default="~/moonraker.conf", metavar='<configfile>', help="Location of moonraker configuration file") parser.add_argument("-l", "--logfile", default="/tmp/moonraker.log", metavar='<logfile>', help="log file name and location") parser.add_argument("-n", "--nologfile", action='store_true', help="disable logging to a file") system_args = parser.parse_args() # Setup Logging version = utils.get_software_version() if system_args.nologfile: log_file = "" else: log_file = os.path.normpath(os.path.expanduser(system_args.logfile)) system_args.logfile = log_file system_args.software_version = version ql, file_logger = utils.setup_logging(log_file, version) if sys.version_info < (3, 7): msg = f"Moonraker requires Python 3.7 or above. " \ f"Detected Version: {sys.version}" logging.info(msg) print(msg) ql.stop() exit(1) # Start IOLoop and Server io_loop = IOLoop.current() estatus = 0 while True: try: server = Server(system_args, file_logger) except Exception: logging.exception("Moonraker Error") estatus = 1 break try: server.start() io_loop.start() except Exception: logging.exception("Server Running Error") estatus = 1 break # Since we are running outside of the the server # it is ok to use a blocking sleep here time.sleep(.5) logging.info("Attempting Server Restart...") io_loop.close(True) logging.info("Server Shutdown") ql.stop() exit(estatus)
import RPi.GPIO as GPIO import time import sensor_repo as sr import utils CONTEXT = "vlotter" PIN = 36 utils.setup_logging(CONTEXT) repo = sr.sensor_repo() GPIO.setmode(GPIO.BOARD) GPIO.setup(PIN, GPIO.IN) try: repo.set_value(CONTEXT, "0") high_level_cnt = 0 while True: if GPIO.input(PIN) == 0: high_level_cnt += 1 else: time.sleep(5) utils.retry_if_none(lambda: repo.set_value(CONTEXT, "0")) high_level_cnt = 0 if high_level_cnt > 100: utils.retry_if_none(lambda: repo.set_value(CONTEXT, "1")) time.sleep(0.1)
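# ----------------------------------------------------------------
# utils.retry_if_none used above is not shown in this excerpt. A hypothetical
# sketch, assuming it simply retries a callable until it returns something
# other than None (e.g. while the sensor repo is briefly unavailable):
import time

def retry_if_none(fn, retries=5, delay=1.0):
    for _ in range(retries):
        result = fn()
        if result is not None:
            return result
        time.sleep(delay)
    return None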
import sys, os import xml.dom.minidom import utils import opc import splunk.entity as en logger = utils.setup_logging("opcmeasure") SCHEME = """<scheme> <title>OPC DA Collector</title> <description>Setup opc measure.</description> <use_external_validation>true</use_external_validation> <streaming_mode>xml</streaming_mode> <endpoint> <args> <arg name="name"> <title>OPC DA Collector</title> <description>OPC measure name </description> </arg> <arg name="server"> <title>Opc Server</title> <description>Opc Server alias that is configured in opcservers.conf.</description> </arg> <arg name="measures"> <title>Measure Items</title> <description>Separated with semicolon ; if multiple.</description> </arg>
def main(): global args args = parser.parse_args() if args.save is '': args.save = datetime.now().strftime('%Y-%m-%d_%H-%M-%S') save_path = os.path.join(args.results_dir, args.save) if not os.path.exists(save_path): os.makedirs(save_path) setup_logging(os.path.join(save_path, 'log.txt')) checkpoint_file = os.path.join(save_path, 'checkpoint_epoch_%s.pth.tar') logging.debug("run arguments: %s", args) logging.info("using pretrained cnn %s", args.cnn) cnn = resnet.__dict__[args.cnn](pretrained=True) vocab = build_vocab() model = CaptionModel(cnn, vocab, embedding_size=args.embedding_size, rnn_size=args.rnn_size, num_layers=args.num_layers, share_embedding_weights=args.share_weights) train_data = get_iterator(get_coco_data(vocab, train=True), batch_size=args.batch_size, max_length=args.max_length, shuffle=True, num_workers=args.workers) val_data = get_iterator(get_coco_data(vocab, train=False), batch_size=args.eval_batch_size, max_length=args.max_length, shuffle=False, num_workers=args.workers) if 'cuda' in args.type: cudnn.benchmark = True model.cuda() optimizer = select_optimizer(args.optimizer, params=model.parameters(), lr=args.lr) regime = lambda e: { 'lr': args.lr * (args.lr_decay**e), 'momentum': args.momentum, 'weight_decay': args.weight_decay } model.finetune_cnn(False) def forward(model, data, training=True, optimizer=None): use_cuda = 'cuda' in args.type loss = nn.CrossEntropyLoss() perplexity = AverageMeter() batch_time = AverageMeter() data_time = AverageMeter() if training: model.train() else: model.eval() end = time.time() for i, (imgs, (captions, lengths)) in enumerate(data): data_time.update(time.time() - end) if use_cuda: imgs = imgs.cuda() captions = captions.cuda(async=True) imgs = Variable(imgs, volatile=not training) captions = Variable(captions, volatile=not training) input_captions = captions[:-1] target_captions = pack_padded_sequence(captions, lengths)[0] pred, _ = model(imgs, input_captions, lengths) err = loss(pred, target_captions) perplexity.update(math.exp(err.data[0])) if training: optimizer.zero_grad() err.backward() clip_grad_norm(model.rnn.parameters(), args.grad_clip) optimizer.step() # measure elapsed time batch_time.update(time.time() - end) end = time.time() if i % args.print_freq == 0: logging.info( '{phase} - Epoch: [{0}][{1}/{2}]\t' 'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t' 'Data {data_time.val:.3f} ({data_time.avg:.3f})\t' 'Perplexity {perp.val:.4f} ({perp.avg:.4f})'.format( epoch, i, len(data), phase='TRAINING' if training else 'EVALUATING', batch_time=batch_time, data_time=data_time, perp=perplexity)) return perplexity.avg for epoch in range(args.start_epoch, args.epochs): if epoch >= args.finetune_epoch: model.finetune_cnn(True) optimizer = adjust_optimizer(optimizer, epoch, regime) # Train train_perp = forward(model, train_data, training=True, optimizer=optimizer) # Evaluate val_perp = forward(model, val_data, training=False) logging.info('\n Epoch: {0}\t' 'Training Perplexity {train_perp:.4f} \t' 'Validation Perplexity {val_perp:.4f} \n'.format( epoch + 1, train_perp=train_perp, val_perp=val_perp)) model.save_checkpoint(checkpoint_file % (epoch + 1))
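# ----------------------------------------------------------------
# The training loop above targets an old PyTorch release: Variable(...,
# volatile=...), err.data[0] and captions.cuda(async=True). On Python 3.7+
# `async` is a reserved word, so that call is a syntax error. A small
# standalone sketch of the modern equivalents (PyTorch >= 1.0), shown as an
# aside rather than a change to the snippet:
import torch

if torch.cuda.is_available():
    x = torch.randn(4, 3, 224, 224)
    x = x.cuda(non_blocking=True)   # replaces .cuda(async=True)
    with torch.no_grad():           # replaces volatile=True Variables
        y = x.mean()
    print(y.item())                 # replaces .data[0]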
import logging import argparse from utils import setup_logging from telegram.client import Telegram """ Prints short description of a webpage (using Telegram's instant view) Usage: python examples/get_instant_view.py api_id api_hash phone https://hackernoon.com/im-harvesting-credit-card-numbers-and-passwords-from-your-site-here-s-how-9a8cb347c5b5 """ if __name__ == '__main__': setup_logging(level=logging.INFO) parser = argparse.ArgumentParser() parser.add_argument('api_id', help='API id') # https://my.telegram.org/apps parser.add_argument('api_hash', help='API hash') parser.add_argument('phone', help='Phone') parser.add_argument('url', help='Webpage URL') args = parser.parse_args() tg = Telegram( api_id=args.api_id, api_hash=args.api_hash, phone=args.phone, database_encryption_key='changeme1234', ) # you must call login method before others tg.login()
import argparse from telegram_api.client import Telegram import utils if __name__ == '__main__': utils.setup_logging() parser = argparse.ArgumentParser() utils.add_api_args(parser) utils.add_proxy_args(parser) args = parser.parse_args() tg = Telegram(api_id=args.api_id, api_hash=args.api_hash, phone=args.phone, database_encryption_key='changeme1234', proxy_server=args.proxy_server, proxy_port=args.proxy_port, proxy_type=utils.parse_proxy_type(args)) # you must call login method before others tg.login() result = tg.get_me() result.wait() print(result.update)
    formatter_class=RawTextHelpFormatter,
    description='''Script for adding data into the data-acquisition service.
Input data must be in json format. Example data can be created using create_simple_data.py.
Data format (json) is a list of dictionaries; each dict contains basic information on a dataset:
[
    {
        "category": DATA_CATEGORY,
        "orgUUID": ORGANISATION NAME (NOT UUID, uuid will be found from name),
        "publicRequest": false/true,
        "source": URL OF FILE TO ADD,
        "title": FILE NAME
    },
    { ... },
    ...
]
''')
parser.add_argument(
    'token',
    help="OAUTH token. For delete and insert it must have admin privileges")
parser.add_argument('file', help="Input file in json format")
parser.add_argument('--debug', action="store_true", help="Debug logging")
args = parser.parse_args()

from cf_config import URL, CF_URL

setup_logging(debug=args.debug)
parse_and_send_data(args.token, args.file)
choices=[0, 1, 2, 3, 4]) parser.add_argument("-inspect", default=False, action="store_true") parser.add_argument("-hidden", type=int, default=128) parser.add_argument("-c", default=0.1, type=float) parser.add_argument("-xi", default=0.1, type=float) parser.add_argument("-lr", default=0.0001, type=float) parser.add_argument("-nepochs", default=20, type=int) args = parser.parse_args() config = { 'overwrite_name': 'si-h%d-lr%g-c%g-xi%g-dataset%d' % (args.hidden, args.lr, args.c, args.xi, args.dataset), } utils.setup_logging(args.seed, config['overwrite_name']) print("Seed: %d" % args.seed) session_config = utils.set_seed(args.seed, args.dataset) n_permute_tasks, it, layer_sizes = utils.setup_dataset(args.dataset, args.inspect) config = { **config, 'c': args.c, 'xi': args.xi, 'lr': args.lr, } if args.hidden != None: layer_sizes = layer_sizes[:1] + [ args.hidden for ln in range(len(layer_sizes) - 2)
import splunk.entity as en import jdbc, utils logger = utils.setup_logging("rpcinits") def load_db(config): ents = en.getEntities(["admin", "conf-inputs"], namespace="splunk-demo-opcda", owner="nobody", sessionKey=config["session_key"], hostPath=config["server_uri"]) # logger.debug("%s" % ents) for dbn, dbv in [(n, v) for n, v in ents.items() if n.startswith("database://")]: name = dbn.replace("database://", "") logger.debug("name=%s" % name) logger.debug("values=%s" % dbv) jdbc.updateDatabase(name, dbv["dburl"], dbv["jdbcdriver"], dbv["user"], dbv["password"], dbv["parameters"])
import sys sys.path.append("..") import logging from dag import Experiment, Recipe import dill import os import utils logger = logging.getLogger("main") utils.setup_logging(debug=True) directory = "../output/08-06-19_seed4" experiment = Experiment(directory=directory) # this materializes immediately x = experiment.spawn_new_tree( dataset_name="mnist", model_name="models.LeNet", init_schema="", optimizer_name="sgd", seed=4, ) x = Recipe(train={"n_epochs": 30})(x) for _ in range(20): # finetune pruned = Recipe( prune_schema="../schemas/pruning_schema_lenet_structuredl1.py", )(x) x = Recipe(
from threading import Thread import cli_args as cli from cli_args import LOG_LEVEL, GRPC_HOST from cli_args import setup_cli_args from flask import Flask from location_client import LocationClient from utils import setup_logging from utils import sleep if __name__ == "__main__": # Parse CLI args args = setup_cli_args(cli.grpc_host, cli.verbose) # Setup logging setup_logging(level=args[LOG_LEVEL]) http = Flask(__name__) @http.route("/count") def val_count(): global count return "Read {0} values".format(count) def read_values(): global count while True: print("Got location: {0}".format(client.get_xy())) count += 1 # Start client
def test_setup_logging(tmp_path):
    config = Namespace(verbose=True, output_dir=tmp_path)
    logger = setup_logging(config)
    assert logger.level == logging.INFO
    assert isinstance(logger, logging.Logger)
    assert next(tmp_path.rglob("*.log")).is_file()
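# ----------------------------------------------------------------
# The setup_logging under test is not shown in this excerpt. A hypothetical
# minimal implementation that would satisfy the three assertions above
# (config is a Namespace with `verbose` and `output_dir`, verbose mapping to
# INFO, and a *.log file created under output_dir):
import logging
from pathlib import Path

def setup_logging(config):
    logger = logging.getLogger()
    logger.setLevel(logging.INFO if config.verbose else logging.WARNING)
    handler = logging.FileHandler(Path(config.output_dir) / "run.log")
    handler.setFormatter(logging.Formatter("%(asctime)s %(levelname)s %(message)s"))
    logger.addHandler(handler)
    return logger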
def main(args): # Setup logging logger = setup_logging(args) # Read params of model params = fetch_model_params(args.model) # Fetch appropriate input functions input_fn = params.get("input_fn", "sequential_input") if input_fn == "sequential_input": input_fn = sequential_input elif input_fn == "generic_text": input_fn = generic_text pred_input_fn = pred_input handle_pred_output_fn = handle_pred_output # get current step current_step = int(estimator_lib._load_global_step_from_checkpoint_dir(params["model_path"])) logger.info(f"Current step {current_step}") if params["mlm_training"]: mlm_sample_text_fn = partial(mlm_sample_text, params) input_fn = partial(generic_text, sample_text_fn=mlm_sample_text_fn) if args.check_dataset: check_dataset(input_fn, params) # Fetch encoder per params encoder = fetch_encoder(params) pred_input_fn = partial(pred_input_fn, path_to_prompt=args.prompt, logger=logger, enc=encoder) # Sample from Dataset if check dataset flag is on if args.check_dataset: check_dataset(input_fn, params, global_step=current_step) # Confirm deletion of checkpoint files if --new flag is set if args.new: if yes_or_no(f"Are you sure you want to remove '{params['model_path']}' to start afresh?"): remove_gs_or_filepath(params["model_path"]) else: exit() # Save config to logdir for experiment management save_config(params, params["model_path"]) # Add to params: auto_layout, auto_layout_and_mesh_shape, use_tpu, num_cores mesh_shape = mtf.convert_to_shape(params["mesh_shape"]) params["num_cores"] = mesh_shape.size params["auto_layout"] = args.auto_layout params["auto_layout_and_mesh_shape"] = args.auto_layout_and_mesh_shape params["use_tpu"] = True if not args.tpu is None else False params["gpu_ids"] = args.gpu_ids params["steps_per_checkpoint"] = args.steps_per_checkpoint # Expand attention types param params["attention_types"] = expand_attention_types_params(params["attention_types"]) assert len(params["attention_types"]) == params["n_layer"] # Assert that the length of expanded list = num layers params["predict_batch_size"] = params.get("predict_batch_size", 1) # Default to 1 params["predict"] = args.predict params['model'] = params.get("model", "GPT") # Default model selection to GPT since it's the only option for now params["export"] = args.export # Set sampling parameters params["sampling_use_entmax"] = args.entmax_sampling # Sample quality of MoE models suffers when using the faster sampling method, so default to slow_sampling if # moe layers are present params["slow_sampling"] = True if params["moe_layers"] is not None else False logger.info(f"params = {params}") # Get eval tasks from params eval_tasks = params.get("eval_tasks", []) has_predict_or_eval_steps_or_eval_tasks = params["predict_steps"] > 0 or params["eval_steps"] > 0 or len( eval_tasks) > 0 for t in eval_tasks: assert t in task_descriptors, f"Eval task '{t}' is not known" task_descriptors[t]["init_fn"](params) # Set up TPUs and Estimator if args.tpu == "colab": tpu_cluster_resolver = tf.distribute.cluster_resolver.TPUClusterResolver() if params["use_tpu"] else None else: tpu_cluster_resolver = tf.distribute.cluster_resolver.TPUClusterResolver(args.tpu) if params["use_tpu"] else None config = tpu_config.RunConfig( cluster=tpu_cluster_resolver, model_dir=params["model_path"], save_checkpoints_steps=None, # Disable the default saver save_checkpoints_secs=None, # Disable the default saver log_step_count_steps=params["iterations"], save_summary_steps=params["iterations"], tpu_config=tpu_config.TPUConfig( 
num_shards=mesh_shape.size, iterations_per_loop=params["iterations"], num_cores_per_replica=1, per_host_input_for_training=tpu_config.InputPipelineConfig.BROADCAST)) estimator = tpu_estimator.TPUEstimator( use_tpu=params["use_tpu"], model_fn=model_fn, config=config, train_batch_size=params["train_batch_size"], eval_batch_size=params["train_batch_size"], predict_batch_size=params["predict_batch_size"], params=params) def _make_task_estimator(task): task_params = params.copy() task_params["eval_task"] = task return tpu_estimator.TPUEstimator( use_tpu=params["use_tpu"], model_fn=model_fn, config=config, train_batch_size=params["train_batch_size"], eval_batch_size=params["eval_batch_size"], predict_batch_size=params["predict_batch_size"], params=task_params) eval_task_estimators = { task: _make_task_estimator(task) for task in eval_tasks } if args.export: export_model(estimator, "export", params) return if args.predict: # Predict predictions = estimator.predict(input_fn=pred_input_fn) logger.info("Predictions generated") enc = fetch_encoder(params) handle_pred_output_fn(predictions, logger, enc, params, out_name=f"predictions_{args.sacred_id}_{current_step}") return def save_eval_results(task, eval_results): def as_python(x): if isinstance(x, numpy.generic): return x.item() return x eval_results = {k: as_python(v) for k, v in eval_results.items()} with open(f'eval_{args.sacred_id}.jsonl', 'a') as fh: json.dump({'task': task, 'current_step': current_step, **eval_results}, fh) fh.write('\n') def run_eval(): logger.info("Running evaluation...") eval_results = estimator.evaluate( input_fn=partial(input_fn, eval=True), steps=params["eval_steps"]) logger.info(f"Eval results: {eval_results}") save_eval_results('validation', eval_results) def run_eval_tasks(): for task in eval_tasks: logger.info(f"Starting evaluation task '{task}'") task_info = task_descriptors[task]["get_task_info_fn"](params) task_estimator = eval_task_estimators[task] task_input_fn = task_descriptors[task]["input_fn"] eval_results = task_estimator.evaluate( input_fn=task_input_fn, steps=task_info["n_steps"], name=task) logger.info(f"Eval task '{task}' results: {eval_results}") save_eval_results(task, eval_results) if args.eval: run_eval_tasks() if params["eval_steps"] > 0: run_eval() return elif has_predict_or_eval_steps_or_eval_tasks: # Eval and train - stop and predict and/or eval every checkpoint while current_step < params["train_steps"]: next_checkpoint = min(current_step + args.steps_per_checkpoint, params["train_steps"]) estimator.train(input_fn=partial(input_fn, global_step=current_step, eval=False), max_steps=next_checkpoint) current_step = next_checkpoint if params["predict_steps"] > 0: logger.info("Running prediction...") predictions = estimator.predict(input_fn=pred_input_fn) enc = fetch_encoder(params) handle_pred_output_fn(predictions, logger, enc, params, out_name=f"predictions_{args.sacred_id}_{current_step}") if params["eval_steps"] > 0: run_eval() if eval_tasks: run_eval_tasks() return else: # Else, just train, don't stop and restart estimator.train(input_fn=partial(input_fn, global_step=current_step, eval=False), max_steps=params["train_steps"])
super(IPKernelApp, self).initialize( argv) # Skipping IPKernelApp.initialize on purpose self.init_connection_file() self.init_poller() self.init_sockets() self.init_heartbeat() self.init_signal() def initialize_kernel(self): self.kernel = ForwardingKernel(parent=self) if __name__ == '__main__': RABBIT_MQ_ADDRESS = (os.environ['MQ_HOST'], int(os.environ['MQ_PORT'])) RABBIT_MQ_CREDENTIALS = (os.environ['MQ_USER'], os.environ['MQ_PASS']) MISSED_HEARTBEAT_LIMIT = int(os.environ['MISSED_HEARTBEAT_LIMIT']) HEARTBEAT_INTERVAL = float(os.environ['HEARTBEAT_INTERVAL']) KERNEL_NAME = os.environ['KERNEL_NAME'] app = ForwardingKernelApp.instance() app.initialize() setup_logging( os.path.join( os.getcwd(), 'forwarding_kernel_logs', 'kernel_{}.log'.format( ForwardingKernel.get_kernel_id(app.connection_file)))) app.initialize_kernel() app.start()