def run():
    import argparse
    from multiprocessing import cpu_count  # assumed source of cpu_count used in the default below
    parser = argparse.ArgumentParser()
    parser.add_argument('--input-dir', required=True)
    parser.add_argument('--directory-to-write', required=True)
    parser.add_argument('--model-dir', required=True)
    parser.add_argument('--log-level', default='debug')
    parser.add_argument('--fresh-start', action="store_true")
    parser.add_argument('--num-of-process', default=cpu_count() - 1, type=int)
    parser.add_argument('--check-pool-every',
                        default=150,
                        type=int,
                        help="It checks multiprocessing.Pool is in stuck "
                        "in every n seconds")
    args = parser.parse_args()

    input_dir = args.input_dir
    output_dir = args.directory_to_write
    model_dir = args.model_dir

    utils.configure_logger(args.log_level)
    logger = utils.get_logger()
    logger.debug("Args: {}".format(args))

    predict(model_dir,
            input_dir,
            output_dir,
            num_of_process=args.num_of_process,
            fresh_start=args.fresh_start,
            check_pool_every=args.check_pool_every)
    logger.info('Done')
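These examples rely on a project-local utils module whose configure_logger / get_logger pair is not shown in this listing. A minimal sketch of what such a pair might look like, assuming a single shared logger under a hypothetical name 'app', is:

import logging

_LOGGER_NAME = "app"  # hypothetical shared logger name, not taken from the original utils module

def configure_logger(log_level="info"):
    """Set up the shared logger with a stream handler at the requested level."""
    logger = logging.getLogger(_LOGGER_NAME)
    logger.setLevel(getattr(logging, log_level.upper(), logging.INFO))
    if not logger.handlers:
        handler = logging.StreamHandler()
        handler.setFormatter(logging.Formatter("%(asctime)s %(levelname)s %(name)s: %(message)s"))
        logger.addHandler(handler)
    return logger

def get_logger():
    """Return the shared logger configured above."""
    return logging.getLogger(_LOGGER_NAME)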
Example #2
def main(args):
    # given program arguments, generate a config file
    config = cfg.generate_config(args)

    # if given a best state, load its config
    if args.state:
        logging.info('loading config from {}'.format(args.state))
        best_state = torch.load(args.state)
        config = best_state['config']

    # create a checkpoint directory
    model_dir = utl.generate_experiment_dir(args.model_dir,
                                            config,
                                            prefix_str='S3DIS-hilbert')

    # configure logger
    utl.configure_logger(model_dir, args.loglevel.upper())

    # get Tensorboard writer object
    writer = utl.get_tensorboard_writer(log_dir=model_dir)

    train(config=config, model_dir=model_dir, writer=writer)

    # close Tensorboard writer
    writer.close()
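The helper utl.get_tensorboard_writer is not shown either; assuming it is a thin wrapper around PyTorch's SummaryWriter, it might look like this sketch:

from torch.utils.tensorboard import SummaryWriter

def get_tensorboard_writer(log_dir):
    """Return a TensorBoard writer that logs into the experiment directory."""
    return SummaryWriter(log_dir=log_dir)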
Example #3
def configure_logging():
    """
        Configure the loggers for Talos. Sets up the Talos loggers
        and discord.py loggers separately, so they can be easily configured
        independently.
    """
    fh = logging.FileHandler(utils.log_folder / "dtalos.log")
    dfh = logging.FileHandler(utils.log_folder / "dpy.log")
    sh = logging.StreamHandler(sys.stderr)
    gh = None
    try:
        import google.cloud.logging as glog
        client = glog.Client()
        gh = client.get_default_handler()
        gh.name = "dtalos"
        gh.setLevel(logging.WARNING)
    except (ImportError, OSError):
        pass

    ff = logging.Formatter("%(levelname)s:%(name)s:%(message)s")

    dlog = logging.getLogger("discord")

    utils.configure_logger(log, handlers=[fh, sh, gh], formatter=ff, level=logging.INFO, propagate=False)
    utils.configure_logger(dlog, handlers=[dfh, sh], formatter=ff, level=logging.INFO, propagate=False)
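This example assumes a utils.configure_logger variant that attaches pre-built handlers to an existing logger. A minimal sketch under that assumption, which also skips handlers that are None (as gh may be when Google Cloud logging is unavailable):

import logging

def configure_logger(logger, handlers=(), formatter=None, level=logging.INFO, propagate=True):
    """Attach the given handlers to the logger with a shared formatter and level."""
    logger.setLevel(level)
    logger.propagate = propagate
    for handler in handlers:
        if handler is None:  # e.g. the Google Cloud handler when the import failed
            continue
        if formatter is not None:
            handler.setFormatter(formatter)
        logger.addHandler(handler)
    return logger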
def run():
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--filename', required=False, default='wikipages.txt')
    parser.add_argument('--no-fetching-links',
                        required=False,
                        default=False,
                        action="store_true")
    parser.add_argument('--num-process',
                        help="Number of process for parallel processing",
                        required=False,
                        default=1,
                        type=int)
    parser.add_argument('--log-level', required=False, default="info")
    args = parser.parse_args()

    configure_logger(args.log_level)
    logger = utils.get_logger()

    fetching_links = not args.no_fetching_links

    logger.info("Input file: {}".format(args))

    directory = '../datasets/wiki-new/'
    try:
        os.mkdir(directory)
    except OSError:
        logger.debug("{} is already exist".format(directory))

    extract_from_file(args.filename, args.num_process, directory,
                      fetching_links)
Example #5
def main(fileName, top_directory, maximum, minSilenceLength,
         silence_threshold, samplerate, gain):
    # Create log directory and configure logging
    duration = 0
    work_on_single_file = fileName is not None
    if work_on_single_file and not os.path.exists(fileName):
        print('Could not find the audio-file {}.'.format(fileName))
        usage()
    if maximum < 10 or maximum > 23:
        print('Audio duration should be between 10 and 23 seconds!')
        usage()
    if minSilenceLength > 0 and (minSilenceLength > 1000
                                 or minSilenceLength < 150):
        print(
            'Minimum silence length value should be between 150msec and 1000msec!'
        )
        usage()
    if samplerate > 0 and (samplerate > 44100 or samplerate < 16000):
        print('Sample rate should be between 16000 and 44100')
        usage()

    logPath = os.path.join(os.getcwd(), 'log')
    if not os.path.exists(logPath):
        os.makedirs(logPath)
    configure_logger(logFileName=os.path.join(logPath, 'vlog.log'))

    # Check if the file exists in the data directory
    if work_on_single_file:
        duration = segmentAudioFile(fileName,
                                    maximum=maximum,
                                    minSilenceLength=minSilenceLength,
                                    silence_threshold=silence_threshold,
                                    samplerate=samplerate,
                                    gain=gain)
        logger.info('-' * 80)
        logger.info('TOTAL DURATION of the FILES is {} '.format(duration))
    else:
        fileList = getfiles(top_directory)
        # Create the directory where the file segments to be written
        for fileName in filter(
                lambda fileName: fileName.endswith('.wav') or fileName.endswith('.mp3'),
                fileList):
            filePath = os.path.join(top_directory, fileName)
            temp_duration = segmentAudioFile(
                filePath,
                maximum=maximum,
                minSilenceLength=minSilenceLength,
                silence_threshold=silence_threshold,
                samplerate=samplerate,
                gain=gain)
            if temp_duration is not None:
                duration += temp_duration
            else:
                break
        logger.info('-' * 80)
        logger.info('TOTAL DURATION of the FILES is \'hh:mm:ss\' {} '.format(
            convertMilliseconsTime(duration)))
Example #6
def run():
    utils.configure_logger('debug')
    logger = utils.get_logger()
    input_directory = sys.argv[1]  # '../datasets/wiki-filtered'
    out_directory = sys.argv[2]  # '../datasets/wiki-senses'
    files = os.listdir(input_directory)
    files = [os.path.join(input_directory, f) for f in files]
    logger.info('total number of files: %d' % len(files))
    create_sense_dataset(files, out_directory)
    logger.info('done')
def run():
    utils.configure_logger('debug')
    logger = utils.get_logger()
    input_directory = sys.argv[1]
    out_directory = sys.argv[2]
    num_of_fold = int(sys.argv[3])
    files = os.listdir(input_directory)
    files = [os.path.join(input_directory, f) for f in files]
    logger.info('total number of files: %d' % len(files))
    create_IMS_formatted_dataset(files, out_directory, k=num_of_fold, num_of_process=30)
    logger.info('done')
def run():
    utils.configure_logger('debug')
    logger = utils.get_logger()
    input_directory = sys.argv[1]
    out_directory = sys.argv[2]
    num_of_fold = int(sys.argv[3])
    num_of_processor = int(sys.argv[4])
    files = os.listdir(input_directory)
    files = [os.path.join(input_directory, f) for f in files]
    logger.info('total number of files: %d' % len(files))
    create_IMS_formatted_dataset(files, out_directory, k=num_of_fold, num_of_process=num_of_processor)
    logger.info('done')
def run():
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--input-file', required=True)
    parser.add_argument('--model-dir', required=True)
    parser.add_argument('--write-every-n-line', default=200000, type=int)
    parser.add_argument('--directory-to-write', default='/tmp/mt-data')
    parser.add_argument('--log-level', default='debug')
    args = parser.parse_args()

    utils.configure_logger(args.log_level)
    logger = utils.get_logger()
    logger.debug("Args: {}".format(args))

    preprocess_mt_input_file(args.input_file, args.model_dir, args.directory_to_write, args.write_every_n_line)
    logger.info('Done')
Example #10
def process(statsfile, k, optfile=None):
    stats = utils.load_pickle(statsfile)
    track_ar = average_rank_per_track(stats)
    clique_ar = average_rank_per_clique(stats)
    ma_p = mean_average_precision(stats)
    #k_p = average_precision(stats, k, ver=True)
    k_p = average_precision_at_k(stats, k)

    # Set up logger
    logger = utils.configure_logger()

    # print results
    logger.info("Number of queries: %d" % len(stats))
    logger.info("Average Rank per Track: %.3f" % track_ar)
    logger.info("Average Rank per Clique: %.3f" % clique_ar)
    logger.info("Mean Average Precision: %.2f %%" % (ma_p * 100))
    logger.info("Precision at %d: %.2f %%" % (k, k_p * 100))

    if optfile is not None:
        stats2 = utils.load_pickle(optfile)
        #plot_rank_histograms(stats, stats2, test=False)
        plot_precision_at_k_histograms(stats,
                                       stats2,
                                       K=[1, 3, 5, 10],
                                       test=False)
    else:
        plot_rank_histogram(stats)
Example #11
    def exceute(self):
        current_bet = self.initial_bet
        current_money = self.initial_money
        betting_history = []
        for index, odd in enumerate(self.odds):
            if not self._betting_condition(self.odds_ratio[index]):
                continue

            current_money = self._bet(current_money, current_bet)
            if current_money <= 0:
                return betting_history

            if self.result[index] == 'win':
                current_money += (current_bet * float(odd))
                current_bet = self._cal_bet_after_winning(current_bet)
            else:
                current_bet = self._cal_bet_after_lossing()

            if logging_settings.get('enable_single_run_logging', False):
                logger = configure_logger("betting", "DoubleBettingAfterWinning.csv")
                logger.info("Index:{}, current_money:{}, current_bet:{}, odds:{}".format(index, current_money,current_bet, odd))

            betting_history.append({'index':index, 'current_money':current_money, 'current_bet':current_bet})

        return betting_history
Example #12
def run():
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--input-file', required=True)
    parser.add_argument('--wsd-output-dir', required=True)
    parser.add_argument('--directory-to-write', required=True)
    parser.add_argument('--log-level', default='debug')
    args = parser.parse_args()

    utils.configure_logger(args.log_level)
    logger = utils.get_logger()
    logger.debug("Args: {}".format(args))

    merger = IMSOutputMerger()
    merger.merge(args.input_file, args.wsd_output_dir, args.directory_to_write)
    logger.info('Merge Done.')
Example #13
    def __init__(self, dnsq, fut, clientip, logger=None):
        self.transport = None
        self.dnsq = dnsq
        self.fut = fut
        self.clientip = clientip
        if logger is None:
            logger = utils.configure_logger("DNSClientProtocol", "DEBUG")
        self.logger = logger
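The DNS examples call utils.configure_logger(name, level) and use the return value directly. A sketch of that variant, assuming it returns a named logger with a stderr handler:

import logging
import sys

def configure_logger(name, level="INFO"):
    """Create (or fetch) a named logger with a stderr handler at the given level."""
    logger = logging.getLogger(name)
    logger.setLevel(getattr(logging, level.upper(), logging.INFO))
    if not logger.handlers:
        handler = logging.StreamHandler(sys.stderr)
        handler.setFormatter(logging.Formatter("%(asctime)s %(name)s %(levelname)s %(message)s"))
        logger.addHandler(handler)
    return logger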
Example #14
def run():
    sense_vocab = build_vocab('../datasets/senses.train.txt',
                              num_already_allocated_tokens=0)
    word_vocab = build_vocab('../datasets/sentences.train.txt')

    configure_logger()
    logger = get_logger()

    logger.info("{} {}".format(word_vocab.size, sense_vocab.size))

    train_iter = read_data(word_vocab, sense_vocab, data_path='../datasets/')
    disambiguator = NeuralDisambiguator(hidden_unit_size=25,
                                        learning_rate=0.001,
                                        num_senses=sense_vocab.size,
                                        vocab_size=word_vocab.size,
                                        embedding_length=50)
    disambiguator.fit(train_iter, max_steps=2000)
Example #15
    def __init__(self, upstream_resolver, upstream_port, logger=None):
        self.loop = asyncio.get_event_loop()
        self.upstream_resolver = upstream_resolver
        self.upstream_port = upstream_port
        if logger is None:
            logger = utils.configure_logger("DNSClient", "DEBUG")
        self.logger = logger
        self.transport = None
Example #16
def run(no_journeys, map_configs):

    overall_start_time, runstr = get_start_time()

    logger = configure_logger(runstr)

    journey_files, no_journeys, attempting_all = get_journey_files(no_journeys)

    base_layers = read_in_convert_base_maps(map_configs)

    clear_out_old_folders_and_make_new(map_configs)

    maps_dict = plot_base_map_layers(base_layers, map_configs)

    (
        start_time,
        journey_plots,
        counters,
        text_vars,
    ) = set_up_plot_lists_and_counters(journey_files)

    timestr, timestr_moving_recents, text_vars = make_first_frames(
        counters, journey_files, text_vars, maps_dict, map_configs)

    counters = make_all_other_frames(
        journey_files,
        attempting_all,
        no_journeys,
        start_time,
        maps_dict,
        runstr,
        text_vars,
        timestr,
        journey_plots,
        counters,
        map_configs,
    )

    make_final_by_year_image(runstr, counters, maps_dict, map_configs)

    additional_frames_journeys_fading_out(journey_files, maps_dict,
                                          journey_plots, counters, map_configs)

    make_all_videos(runstr, counters, map_configs)

    clear_out_images_for_video_folder(map_configs)

    overall_finish_time = datetime.datetime.now()

    overall_run_notes(
        runstr,
        attempting_all,
        no_journeys,
        overall_start_time,
        overall_finish_time,
        counters,
        map_configs,
    )
def run():
    configure_logger()
    logger = get_logger()

    dataset = DataSet('../datasets/wiki-new')

    FLAGS = {
        "embedding_length": 10,
        "min_counts": 10,
        "batch_size": 16,
        "hidden_unit_size": 10,
        "learning_rate": .001
    }

    disambiguator = NeuralDisambiguator(dataset,
                                        FLAGS,
                                        use_pretrained_embeddings=False)
    disambiguator.fit(max_steps=2000)
Example #18
def main(args):
    # given program arguments, generate a config file
    dataset = S3DIS(args)

    # create a checkpoint directory
    model_dir = dataset.experiment_dir

    # configure logger
    utl.configure_logger(model_dir, args.loglevel.upper())

    # get Tensorboard writer object
    writer = utl.get_tensorboard_writer(log_dir=model_dir)

    dataset.config.dump_to_tensorboard(writer=writer)

    train(dataset=dataset, model_dir=model_dir, writer=writer)

    # close Tensorboard writer
    writer.close()
Example #19
def run():
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--input-dir', required=True)
    parser.add_argument('--directory-to-write', default='/tmp/ims-mt-data')
    parser.add_argument('--num-of-process', default=1, type=int)
    parser.add_argument('--log-level', default='debug')
    args = parser.parse_args()

    input_directory = args.input_dir
    out_directory = args.directory_to_write

    utils.configure_logger(args.log_level)
    logger = utils.get_logger()
    logger.debug("Args: {}".format(args))

    files = os.listdir(input_directory)
    files = [os.path.join(input_directory, f) for f in files]
    logger.info('total number of files: %d' % len(files))
    create_IMS_formatted_dataset(files, out_directory, args.num_of_process)
    logger.info('Done')
def run():
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--filename', required=False, default='wikipages.txt')
    parser.add_argument('--num-process', help="Number of process for parallel processing", required=False, default=1,
                        type=int)
    parser.add_argument('--log-level', required=False, default="info")
    args = parser.parse_args()

    configure_logger(args.log_level)
    logger = utils.get_logger()

    logger.info("Input file: {}".format(args))

    directory = '../datasets/wiki/'
    try:
        os.mkdir(directory)
    except OSError:
        logger.debug("{} is already exist".format(directory))

    extract_from_file(args.filename, args.num_process)
Example #21
def run():
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--uwsd-dataset',
                        required=False,
                        default='../datasets/wiki')
    parser.add_argument(
        '--category-file',
        required=False,
        default='../datasets/wikipedia-miner/en_20090306/categorylink.csv')
    parser.add_argument(
        '--generality-file',
        required=False,
        default='../datasets/wikipedia-miner/en_20090306/generality.csv')
    parser.add_argument(
        '--pageid-title-file',
        required=False,
        default='../datasets/wikipedia-miner/en_20090306/page.csv')
    parser.add_argument('--num-process',
                        help="Number of process for parallel processing",
                        required=False,
                        default=1,
                        type=int)
    parser.add_argument('--log-level', required=False, default="info")
    args = parser.parse_args()

    configure_logger(args.log_level)
    logger = utils.get_logger()

    logger.info("Running.")

    files = sorted(glob.glob(os.path.abspath(args.uwsd_dataset) + "/*.tw.txt"))
    create_page_id_link_mapping_file(files, args.uwsd_dataset)
    # files = sorted(glob.glob(os.path.abspath(args.uwsd_dataset) + "/*.pageid.txt"))
    # get_categories_for_senses(files, args.category_file, args.pageid_title_file, args.generality_file)

    logger.info("Done")
Example #22
def get_app(args):
    logger = utils.configure_logger("doh-httpproxy", args.level)
    app = DOHApplication(logger=logger, debug=args.debug)
    app.set_upstream_resolver(args.upstream_resolver, args.upstream_port)
    app.set_ecs(args.ecs)
    app.set_socket(args.socket)
    app.router.add_get(args.uri, doh1handler)
    app.router.add_post(args.uri, doh1handler)

    # Get trusted reverse proxies and format it for aiohttp_remotes setup
    #if len(args.trusted) == 0:
    #    x_forwarded_handling = aiohttp_remotes.XForwardedRelaxed()
    #    forwarded_handling = aiohttp_remotes.ForwardedRelaxed()
    #else:
    #    x_forwarded_handling = aiohttp_remotes.XForwardedStrict([args.trusted])
    #    forwarded_handling = aiohttp_remotes.ForwardedStrict([args.trusted])
    x_forwarded_handling = aiohttp_remotes.XForwardedRelaxed()
    forwarded_handling = aiohttp_remotes.ForwardedRelaxed()
    asyncio.ensure_future(
        aiohttp_remotes.setup(app, forwarded_handling, x_forwarded_handling))
    return app
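One possible way to serve the returned application, assuming the usual aiohttp entry point; the host and port attribute names are assumptions for illustration only:

from aiohttp import web

def main(args):
    app = get_app(args)
    # Serve the DoH HTTP proxy; args.host / args.port are hypothetical attribute names.
    web.run_app(app, host=args.host, port=args.port)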
Example #24
def start_algorithm(initial_state_file=None,
                    config_file=None,
                    fixed_rounds=None,
                    start_clean=True):
    '''
    Main function
    '''
    FUNCTION = 'main'
    output_dir = "./output/"
    output_dir_log = output_dir + "ALGO_TRADING_LOG_{}.txt".format(
        utils.date_now_filename())
    output_dir_log_json = output_dir + "ALGO_TRADINGJSON_LOG_{}.txt".format(
        utils.date_now_filename())
    output_dir_plots = output_dir + "plots"
    output_dir_status = output_dir + "ALGO_STATUS_LOG_{}.txt".format(
        utils.date_now_filename())
    output_dir_archive = output_dir + "ALGO_ARCHIVE_LOG_{}.txt".format(
        utils.date_now_filename())
    output_dir_plotdata = output_dir + "ALGO_PLOTDATA_LOG_{}.txt".format(
        utils.date_now_filename())
    output_dir_overview = output_dir + "ALGO_OVERVIEW_LOG_{}.txt".format(
        utils.date_now_filename())
    ending_state_path = output_dir + "ALGO_ENDING_STATE_{}.txt".format(
        utils.date_now_filename())
    overview_plotdata_path = output_dir + "ALGO_OVERVIEWPLOTDATA_LOG_{}.txt".format(
        utils.date_now_filename())

    # Clean output directory
    utils.clean_output(output_dir, output_dir_plots)

    # Get config params
    if not config_file:
        config_file = args.config_file
    config_params = utils.read_config(config_file)

    # Configure logging
    logger = utils.configure_logger("default", output_dir_log,
                                    config_params["logging"])

    logger.info("Starting algorithm", extra={'function': FUNCTION})

    # Initialize stocks object with configuration values
    if not initial_state_file:
        initial_state_file = args.initial_state_file

    logger.debug("Reading initial values from config file: {}...".format(
        initial_state_file),
                 extra={'function': FUNCTION})
    init_val = utils.read_json_data(initial_state_file, logger=logger)

    stocks = Stocks(balance=init_val["balance"],
                    bought_stocks=init_val["bought_stocks"],
                    monitored_stocks=init_val["monitored_stocks"],
                    current_status=init_val["current_status"],
                    monitored_stock_data=init_val["monitored_stock_data"],
                    archive=init_val["archive"],
                    interesting_stocks=init_val["interesting_stocks"],
                    not_interesting_stocks=init_val["not_interesting_stocks"],
                    yahoo_calls=init_val["yahoo_calls"],
                    results=init_val["results"])

    # Initialize status files
    update_state(stocks, logger, output_dir_log, output_dir_overview,
                 output_dir_status, output_dir_archive, output_dir_plotdata,
                 output_dir_log_json, ending_state_path,
                 overview_plotdata_path)

    # Check which stocks to monitor
    if start_clean:
        logger.info("Getting and initializing list of stocks to monitor...",
                    extra={'function': FUNCTION})
        stocks.initialize_stocks(date=datetime.now(),
                                 logger=logger,
                                 config_params=config_params,
                                 update_nasdaq_file=False)

    # Set initial values
    stock_market_open = True
    archive_session = False
    counter = 0

    while stock_market_open:
        # Update config params
        if not config_file:
            config_file = args.config_file
        config_params = utils.read_config(config_file)

        # Read and save whether the user has ordered manually to sell a certain stock, and if true, sell it
        logger.info(
            "Checking whether user has ordered to buy or sell stocks...",
            extra={'function': FUNCTION})
        commands_log = utils.get_latest_log("COMMANDS", logger=logger)
        commands = {
            'commands': [],
            'tickers_to_sell': [],
            'tickers_to_stop_monitor': []
        }
        if commands_log:
            commands = utils.read_commands(commands_log, logger=logger)
            stocks.hard_sell_check(commands, commands_log, config_params,
                                   logger)
            stocks.check_to_stop_monitor_stocks(commands, commands_log,
                                                config_params, logger)

        # Loop through monitored stocks
        logger.info("Checking monitored stocks...",
                    extra={'function': FUNCTION})
        for stock in stocks.monitored_stocks:
            stocks.check_monitored_stock(stock,
                                         config_params=config_params,
                                         logger=logger)

        # Check if we should monitor more stocks
        if config_params['main']['check_for_new_stocks']:
            logger.info("Checking if we should monitor more stocks...",
                        extra={'function': FUNCTION})
            stocks.check_to_monitor_new_stocks(datetime.now(), config_params,
                                               logger)

        # Plot data per monitored stock
        if config_params['main']['plot_data']:
            logger.info("Plotting monitored stock data...",
                        extra={'function': FUNCTION})
            stocks.plot_monitored_stock_data(output_dir_plots, logger=logger)

        # Check to terminate algorithm
        if fixed_rounds:
            counter += 1
            if counter >= fixed_rounds:
                logger.info(
                    "Terminating algorithm because of configured fixed rounds",
                    extra={'function': FUNCTION})
                archive_session = True
                break
        elif config_params['main'][
                'sell_all_before_finish'] and utils.before_close():
            logger.info(
                "Terminating algorithm and selling all owned stocks because it was configured by the user",
                extra={'function': FUNCTION})
            archive_session = True
            stocks.hard_sell_check({"tickers_to_sell": ["ALLSTOCKS"]},
                                   commands_log, config_params, logger)
            break
        elif "STOPALGORITHM" in commands['commands']:
            logger.info(
                "Terminating algorithm because it was instructed by the user",
                extra={'function': FUNCTION})
            archive_session = True
            commands['commands'].remove("STOPALGORITHM")
            utils.write_json(commands, commands_log, logger=logger)
            if config_params['main']['sell_all_before_finish']:
                stocks.hard_sell_check({"tickers_to_sell": ["ALLSTOCKS"]},
                                       commands_log, config_params, logger)
            break
        else:
            scraper = YahooScraper()
            if scraper.all_markets_closed(
                    stocks.monitored_stocks, config_params, logger
            ) and not config_params['main']['ignore_market_hours']:
                logger.info(
                    "Terminating algorithm because all relevant markets are closed",
                    extra={'function': FUNCTION})
                break

        # Update state
        update_state(stocks, logger, output_dir_log, output_dir_overview,
                     output_dir_status, output_dir_archive,
                     output_dir_plotdata, output_dir_log_json,
                     ending_state_path, overview_plotdata_path)

        # Sleep
        seconds_to_sleep = config_params['main']['seconds_to_sleep']
        logger.info("Sleeping {} seconds".format(seconds_to_sleep),
                    extra={'function': FUNCTION})
        time.sleep(seconds_to_sleep)

    # Perform final operations before terminating
    stocks.current_status = utils.close_markets(stocks.current_status)
    update_state(stocks, logger, output_dir_log, output_dir_overview,
                 output_dir_status, output_dir_archive, output_dir_plotdata,
                 output_dir_log_json, ending_state_path,
                 overview_plotdata_path)
    if archive_session:
        transactions_file = utils.get_latest_log("ARCHIVE", logger=logger)
        status_file = utils.get_latest_log("STATUS", logger=logger)
        overview_file = utils.get_latest_log("OVERVIEW", logger=logger)
        utils.archive_session([transactions_file, status_file, overview_file],
                              logger=logger)
        stocks.archive = []
        update_state(stocks, logger, output_dir_log, output_dir_overview,
                     output_dir_status, output_dir_archive,
                     output_dir_plotdata, output_dir_log_json,
                     ending_state_path, overview_plotdata_path)

    return True
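A hedged usage sketch for start_algorithm, with illustrative file paths (the real configuration and initial-state files are project specific):

if __name__ == "__main__":
    # Run a short, bounded trading session; the paths below are hypothetical examples.
    start_algorithm(initial_state_file="./config/initial_state.json",
                    config_file="./config/config.json",
                    fixed_rounds=3,
                    start_clean=True)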
# local stuff
import hdf5_getters as GETTERS
import dan_tools
import time
import utils
import scipy.cluster.vq as vq
import pylab as plt
from transforms import load_transform
import analyze_stats as anst

# params, for ICMR paper: 75 and 1.96
WIN = 75
PATCH_LEN = WIN*12

# Set up logger
logger = utils.configure_logger()

# Global models
lda = None
pca = None

def compute_codes_orig_it(track_ids, maindir, clique_ids, start_idx, end_idx):
    """Computes the original features, based on Thierry and Ellis, 2012.
    Dimensionality reduction using PCA of 50, 100, and 200 components."""
    res = []
    trainedpca = utils.load_pickle("models/pca_250Kexamples_900dim_nocovers.pkl")
    pca_components = [50,100,200]

    # Init codes
    codes = []
    for n_comp in pca_components:
Example #26
            sleep(60)
            return run(funcname, args=args, kwargs=kwargs, retries=retries + 1)
        else:
            logger.error("Server refused")
            return False
    except EOFError:
        logger.error('Internal server error')
        return False
    except Exception:
        print_exc_plus()


if __name__ == '__main__':
    import argparse
    from utils import configure_logger
    configure_logger(logger)

    def print_log(debug=False):
        if debug:
            os.system(f'tail -n 43 -f \"{MAIN_LOGFILE}\"')
        else:
            os.system(f'tail -n 43 -f \"{CONSOLE_LOGFILE}\"')

    try:
        actions = {
            'info': 'show buildbot info',
            'update': '[--overwrite] update pushed files to the repo',
            'clean': '[dir / all] checkout pkgbuilds in packages',
            'rebuild': '[dir1 dir2 --clean] rebuild packages',
            'log': '[--debug] print log',
            'upload': '[dir1 dir2 --overwrite] force upload packages',
import time

# local stuff
import pca
import hdf5_getters as GETTERS
import dan_tools
import utils
from transforms import load_transform

# Thierry's original parameters for ISMIR paper
WIN = 75
PWR = 1.96
PATCH_LEN = WIN * 12

# Set up logger
logger = utils.configure_logger()


def extract_feats(filename, td=None, lda_file=None, lda_n=0, ver=True):
    """Computes the features using the dictionary transformation td. 
        If it doesn't exist, computes them using Thierry's method.

     The improved pipeline is composed of 11 steps:

        1.- Beat Synchronous Chroma
        2.- L2-Norm
        3.- Shingle (PATCH_LEN: 75 x 12)
        4.- 2D-FFT
        5.- L2-Norm
        6.- Log-Scale
        7.- Sparse Coding
Example #28
import logging
import os
import pathlib
import sys

import requests
from google.api_core import exceptions
from google.cloud import datastore
from google.cloud import vision
from PIL import Image, ImageDraw

import utils
from flickr_to_datastore import write_entities_to_datastore

### LOGGING ####################################################################
logger = logging.getLogger(__name__)
utils.configure_logger(logger, console_output=True)
################################################################################


def pull_unclassified_entities(ds_client):
    # TODO: Update docstring
    """Retrieves entities from datastore that have no value for vision_labels.

    Args:
        ds_client (google.cloud.datastore.client.Client)

    Returns:
        list of google.cloud.datastore.entity.Entity of kind 'Photo'
    """
    query = ds_client.query(kind="Photo")
    query.add_filter("is_classified", "=", False)
"""
    # Configure a connection to the database at the URL specified by the
    # DATABASE_URL environment variable.
    # Remember that we're using `echo=True` so we can see all generated SQL.
    engine = sqlalchemy.create_engine(os.environ['DATABASE_URL'], echo=True)

    # Create a session factory. Calling `Session()` will create new SQLAlchemy
    # ORM sessions.
    Session = orm.sessionmaker(bind=engine)

    # Create a new session which we'll use for the following investigation.
    session = Session()
"""

with step():
    configure_logger()
    engine = sqlalchemy.create_engine(os.environ['DATABASE_URL'])
    Session = orm.sessionmaker(bind=engine)
    session = Session()

with step():
    q = session.query(Zebra).filter(Zebra.when_born <= now() - 2 * YEAR)
    old_zebra = q.first()

with step():
    wh = session.query(WateringHole).first()

with step():
    session.rollback()

# Commit instead
def main():
    args = get_args()
    os.environ["CUDA_VISIBLE_DEVICES"] = str(args.gpu)

    cur_timestamp = str(datetime.now())[:-3]  # we also include ms to reduce the probability of a name collision
    model_width = {'linear': '', 'cnn': args.n_filters_cnn, 'lenet': '', 'resnet18': ''}[args.model]
    model_str = '{}{}'.format(args.model, model_width)
    model_name = '{} dataset={} model={} eps={} attack={} m={} attack_init={} fgsm_alpha={} epochs={} pgd={}-{} grad_align_cos_lambda={} lr_max={} seed={}'.format(
        cur_timestamp, args.dataset, model_str, args.eps, args.attack, args.minibatch_replay, args.attack_init, args.fgsm_alpha, args.epochs,
        args.pgd_alpha_train, args.pgd_train_n_iters, args.grad_align_cos_lambda, args.lr_max, args.seed)
    if not os.path.exists('models'):
        os.makedirs('models')
    logger = utils.configure_logger(model_name, args.debug)
    logger.info(args)
    half_prec = args.half_prec
    n_cls = 2 if 'binary' in args.dataset else 10

    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)

    double_bp = True if args.grad_align_cos_lambda > 0 else False
    n_eval_every_k_iter = args.n_eval_every_k_iter
    args.pgd_alpha = args.eps / 4

    eps, pgd_alpha, pgd_alpha_train = args.eps / 255, args.pgd_alpha / 255, args.pgd_alpha_train / 255
    train_data_augm = False if args.dataset in ['mnist'] else True
    train_batches = data.get_loaders(args.dataset, -1, args.batch_size, train_set=True, shuffle=True, data_augm=train_data_augm)
    train_batches_fast = data.get_loaders(args.dataset, n_eval_every_k_iter, args.batch_size, train_set=True, shuffle=False, data_augm=False)
    test_batches = data.get_loaders(args.dataset, args.n_final_eval, args.batch_size_eval, train_set=False, shuffle=False, data_augm=False)
    test_batches_fast = data.get_loaders(args.dataset, n_eval_every_k_iter, args.batch_size_eval, train_set=False, shuffle=False, data_augm=False)

    model = models.get_model(args.model, n_cls, half_prec, data.shapes_dict[args.dataset], args.n_filters_cnn).cuda()
    model.apply(utils.initialize_weights)
    model.train()

    if args.model == 'resnet18':
        opt = torch.optim.SGD(model.parameters(), lr=args.lr_max, momentum=0.9, weight_decay=args.weight_decay)
    elif args.model == 'cnn':
        opt = torch.optim.Adam(model.parameters(), lr=args.lr_max, weight_decay=args.weight_decay)
    elif args.model == 'lenet':
        opt = torch.optim.Adam(model.parameters(), lr=args.lr_max, weight_decay=args.weight_decay)
    else:
        raise ValueError('decide about the right optimizer for the new model')

    if half_prec:
        if double_bp:
            amp.register_float_function(torch, 'batch_norm')
        model, opt = amp.initialize(model, opt, opt_level="O1")

    if args.attack == 'fgsm':  # needed here only for Free-AT
        delta = torch.zeros(args.batch_size, *data.shapes_dict[args.dataset][1:]).cuda()
        delta.requires_grad = True

    lr_schedule = utils.get_lr_schedule(args.lr_schedule, args.epochs, args.lr_max)
    loss_function = nn.CrossEntropyLoss()

    train_acc_pgd_best, best_state_dict = 0.0, copy.deepcopy(model.state_dict())
    start_time = time.time()
    time_train, iteration, best_iteration = 0, 0, 0
    for epoch in range(args.epochs + 1):
        train_loss, train_reg, train_acc, train_n, grad_norm_x, avg_delta_l2 = 0, 0, 0, 0, 0, 0
        for i, (X, y) in enumerate(train_batches):
            if i % args.minibatch_replay != 0 and i > 0:  # take new inputs only every `minibatch_replay` iterations
                X, y = X_prev, y_prev
            time_start_iter = time.time()
            # epoch=0 runs only for one iteration (to check the training stats at init)
            if epoch == 0 and i > 0:
                break
            X, y = X.cuda(), y.cuda()
            lr = lr_schedule(epoch - 1 + (i + 1) / len(train_batches))  # epoch - 1 since the 0th epoch is skipped
            opt.param_groups[0].update(lr=lr)

            if args.attack in ['pgd', 'pgd_corner']:
                pgd_rs = True if args.attack_init == 'random' else False
                n_eps_warmup_epochs = 5
                n_iterations_max_eps = n_eps_warmup_epochs * data.shapes_dict[args.dataset][0] // args.batch_size
                eps_pgd_train = min(iteration / n_iterations_max_eps * eps, eps) if args.dataset == 'svhn' else eps
                delta = utils.attack_pgd_training(
                    model, X, y, eps_pgd_train, pgd_alpha_train, opt, half_prec, args.pgd_train_n_iters, rs=pgd_rs)
                if args.attack == 'pgd_corner':
                    delta = eps * utils.sign(delta)  # project to the corners
                    delta = clamp(X + delta, 0, 1) - X

            elif args.attack == 'fgsm':
                if args.minibatch_replay == 1:
                    if args.attack_init == 'zero':
                        delta = torch.zeros_like(X, requires_grad=True)
                    elif args.attack_init == 'random':
                        delta = utils.get_uniform_delta(X.shape, eps, requires_grad=True)
                    else:
                        raise ValueError('wrong args.attack_init')
                else:  # if Free-AT, we just reuse the existing delta from the previous iteration
                    delta.requires_grad = True

                X_adv = clamp(X + delta, 0, 1)
                output = model(X_adv)
                loss = F.cross_entropy(output, y)
                if half_prec:
                    with amp.scale_loss(loss, opt) as scaled_loss:
                        grad = torch.autograd.grad(scaled_loss, delta, create_graph=True if double_bp else False)[0]
                        grad /= scaled_loss / loss  # reverse back the scaling
                else:
                    grad = torch.autograd.grad(loss, delta, create_graph=True if double_bp else False)[0]

                grad = grad.detach()

                argmax_delta = eps * utils.sign(grad)

                n_alpha_warmup_epochs = 5
                n_iterations_max_alpha = n_alpha_warmup_epochs * data.shapes_dict[args.dataset][0] // args.batch_size
                fgsm_alpha = min(iteration / n_iterations_max_alpha * args.fgsm_alpha, args.fgsm_alpha) if args.dataset == 'svhn' else args.fgsm_alpha
                delta.data = clamp(delta.data + fgsm_alpha * argmax_delta, -eps, eps)
                delta.data = clamp(X + delta.data, 0, 1) - X

            elif args.attack == 'random_corner':
                delta = utils.get_uniform_delta(X.shape, eps, requires_grad=False)
                delta = eps * utils.sign(delta)

            elif args.attack == 'none':
                delta = torch.zeros_like(X, requires_grad=False)
            else:
                raise ValueError('wrong args.attack')

            # extra FP+BP to calculate the gradient to monitor it
            if args.attack in ['none', 'random_corner', 'pgd', 'pgd_corner']:
                grad = get_input_grad(model, X, y, opt, eps, half_prec, delta_init='none',
                                      backprop=args.grad_align_cos_lambda != 0.0)

            delta = delta.detach()

            output = model(X + delta)
            loss = loss_function(output, y)

            reg = torch.zeros(1).cuda()[0]  # for .item() to run correctly
            if args.grad_align_cos_lambda != 0.0:
                grad2 = get_input_grad(model, X, y, opt, eps, half_prec, delta_init='random_uniform', backprop=True)
                grads_nnz_idx = ((grad**2).sum([1, 2, 3])**0.5 != 0) * ((grad2**2).sum([1, 2, 3])**0.5 != 0)
                grad1, grad2 = grad[grads_nnz_idx], grad2[grads_nnz_idx]
                grad1_norms, grad2_norms = l2_norm_batch(grad1), l2_norm_batch(grad2)
                grad1_normalized = grad1 / grad1_norms[:, None, None, None]
                grad2_normalized = grad2 / grad2_norms[:, None, None, None]
                cos = torch.sum(grad1_normalized * grad2_normalized, (1, 2, 3))
                reg += args.grad_align_cos_lambda * (1.0 - cos.mean())

            loss += reg

            if epoch != 0:
                opt.zero_grad()
                utils.backward(loss, opt, half_prec)
                opt.step()

            time_train += time.time() - time_start_iter
            train_loss += loss.item() * y.size(0)
            train_reg += reg.item() * y.size(0)
            train_acc += (output.max(1)[1] == y).sum().item()
            train_n += y.size(0)

            with torch.no_grad():  # no grad for the stats
                grad_norm_x += l2_norm_batch(grad).sum().item()
                delta_final = clamp(X + delta, 0, 1) - X  # we should measure delta after the projection onto [0, 1]^d
                avg_delta_l2 += ((delta_final ** 2).sum([1, 2, 3]) ** 0.5).sum().item()

            if iteration % args.eval_iter_freq == 0:
                train_loss, train_reg = train_loss / train_n, train_reg / train_n
                train_acc, avg_delta_l2 = train_acc / train_n, avg_delta_l2 / train_n

                # it'd be incorrect to recalculate the BN stats on the test sets and for clean / adversarial points
                utils.model_eval(model, half_prec)

                test_acc_clean, _, _ = rob_acc(test_batches_fast, model, eps, pgd_alpha, opt, half_prec, 0, 1)
                test_acc_fgsm, test_loss_fgsm, fgsm_deltas = rob_acc(test_batches_fast, model, eps, eps, opt, half_prec, 1, 1, rs=False)
                test_acc_pgd, test_loss_pgd, pgd_deltas = rob_acc(test_batches_fast, model, eps, pgd_alpha, opt, half_prec, args.attack_iters, 1)
                cos_fgsm_pgd = utils.avg_cos_np(fgsm_deltas, pgd_deltas)
                train_acc_pgd, _, _ = rob_acc(train_batches_fast, model, eps, pgd_alpha, opt, half_prec, args.attack_iters, 1)  # needed for early stopping

                grad_x = utils.get_grad_np(model, test_batches_fast, eps, opt, half_prec, rs=False)
                grad_eta = utils.get_grad_np(model, test_batches_fast, eps, opt, half_prec, rs=True)
                cos_x_eta = utils.avg_cos_np(grad_x, grad_eta)

                time_elapsed = time.time() - start_time
                train_str = '[train] loss {:.3f}, reg {:.3f}, acc {:.2%} acc_pgd {:.2%}'.format(train_loss, train_reg, train_acc, train_acc_pgd)
                test_str = '[test] acc_clean {:.2%}, acc_fgsm {:.2%}, acc_pgd {:.2%}, cos_x_eta {:.3}, cos_fgsm_pgd {:.3}'.format(
                    test_acc_clean, test_acc_fgsm, test_acc_pgd, cos_x_eta, cos_fgsm_pgd)
                logger.info('{}-{}: {}  {} ({:.2f}m, {:.2f}m)'.format(epoch, iteration, train_str, test_str,
                                                                      time_train/60, time_elapsed/60))

                if train_acc_pgd > train_acc_pgd_best:  # catastrophic overfitting can be detected on the training set
                    best_state_dict = copy.deepcopy(model.state_dict())
                    train_acc_pgd_best, best_iteration = train_acc_pgd, iteration

                utils.model_train(model, half_prec)
                train_loss, train_reg, train_acc, train_n, grad_norm_x, avg_delta_l2 = 0, 0, 0, 0, 0, 0

            iteration += 1
            X_prev, y_prev = X.clone(), y.clone()  # needed for Free-AT

        if epoch == args.epochs:
            torch.save({'last': model.state_dict(), 'best': best_state_dict}, 'models/{} epoch={}.pth'.format(model_name, epoch))
            # disable global conversion to fp16 from amp.initialize() (https://github.com/NVIDIA/apex/issues/567)
            context_manager = amp.disable_casts() if half_prec else utils.nullcontext()
            with context_manager:
                last_state_dict = copy.deepcopy(model.state_dict())
                half_prec = False  # final eval is always in fp32
                model.load_state_dict(last_state_dict)
                utils.model_eval(model, half_prec)
                opt = torch.optim.SGD(model.parameters(), lr=0)

                attack_iters, n_restarts = (50, 10) if not args.debug else (10, 3)
                test_acc_clean, _, _ = rob_acc(test_batches, model, eps, pgd_alpha, opt, half_prec, 0, 1)
                test_acc_pgd_rr, _, deltas_pgd_rr = rob_acc(test_batches, model, eps, pgd_alpha, opt, half_prec, attack_iters, n_restarts)
                logger.info('[last: test on 10k points] acc_clean {:.2%}, pgd_rr {:.2%}'.format(test_acc_clean, test_acc_pgd_rr))

                if args.eval_early_stopped_model:
                    model.load_state_dict(best_state_dict)
                    utils.model_eval(model, half_prec)
                    test_acc_clean, _, _ = rob_acc(test_batches, model, eps, pgd_alpha, opt, half_prec, 0, 1)
                    test_acc_pgd_rr, _, deltas_pgd_rr = rob_acc(test_batches, model, eps, pgd_alpha, opt, half_prec, attack_iters, n_restarts)
                    logger.info('[best: test on 10k points][iter={}] acc_clean {:.2%}, pgd_rr {:.2%}'.format(
                        best_iteration, test_acc_clean, test_acc_pgd_rr))

        utils.model_train(model, half_prec)

    logger.info('Done in {:.2f}m'.format((time.time() - start_time) / 60))
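The gradient-alignment regularizer computed inline in the training loop above can be read as a standalone helper; a sketch that mirrors that computation (the function name is chosen here for illustration):

import torch

def grad_align_reg(grad1, grad2, cos_lambda):
    """Penalize misalignment (1 - cosine) between two batches of input gradients,
    skipping examples where either gradient is identically zero."""
    nnz = ((grad1 ** 2).sum([1, 2, 3]) ** 0.5 != 0) * ((grad2 ** 2).sum([1, 2, 3]) ** 0.5 != 0)
    g1, g2 = grad1[nnz], grad2[nnz]
    g1 = g1 / ((g1 ** 2).sum([1, 2, 3]) ** 0.5)[:, None, None, None]
    g2 = g2 / ((g2 ** 2).sum([1, 2, 3]) ** 0.5)[:, None, None, None]
    cos = torch.sum(g1 * g2, (1, 2, 3))
    return cos_lambda * (1.0 - cos.mean())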
Example #31
                        help="Path to a file with urls to load into database",
                        type=str)
    parser.add_argument("-w",
                        "--workers",
                        help="Start given number of workers",
                        type=int)
    parser.add_argument("-t",
                        "--threads",
                        help="Start given number of threaded workers",
                        type=int)
    parser.add_argument("-d",
                        "--debug",
                        help="Enable debug logging in workers",
                        action="store_true")
    args = parser.parse_args()
    utils.configure_logger(args.debug)

    if args.workers is not None and args.threads is not None:
        print(
            "You can't start both threaded and process-based workers simultaneously"
        )
        exit(1)

    if args.stop:
        kill_workers()

    if args.erase:
        erase_database()

    if args.load:
        load_urls_to_database(args.load)
Example #32
            logger.info("repo-remove: %s", repo_remove(remove_pkgs))
        else:
            logger.warning(f'Nothing to remove in {arch}')
    archive_dir = Path('archive')
    for fpath in archive_dir.iterdir():
        nosigname = fpath.name[:-4] if fpath.name.endswith(
            '.sig') else fpath.name
        if nosigname.endswith(PKG_SUFFIX) and \
            get_pkg_details_from_name(nosigname).pkgname in pkgnames:
            throw_away(fpath)
    logger.info('finished remove')
    return True


if __name__ == '__main__':
    configure_logger(logger, logfile='repo.log', rotate_size=1024 * 1024 * 10)
    import argparse
    try:
        parser = argparse.ArgumentParser(
            description='Automatic management tool for an arch repo.')
        parser.add_argument(
            '-a',
            '--arch',
            nargs='?',
            default=False,
            help='arch to regenerate, split by comma, defaults to all')
        parser.add_argument('-o',
                            '--overwrite',
                            action='store_true',
                            help='overwrite when updating existing packages')
        parser.add_argument(
Example #33
    def __init__(self, start, number_of_stocks, sell_criterium, stocks=None):
        if not stocks:
            stocks = []

        super().__init__(balance=[10000, 10000],
                         bought_stocks={},
                         monitored_stocks=[],
                         monitored_stock_data={},
                         archive=[],
                         current_status={},
                         interesting_stocks=[],
                         not_interesting_stocks=[],
                         yahoo_calls={},
                         results={})

        if isinstance(start, str):
            start = datetime.strptime(start, '%Y/%m/%d-%H:%M:%S')
        self.start = start

        self.ip = "192.168.0.14"
        self.M = 500
        self.Pavg = 20

        self.sell_criterium = sell_criterium

        self.indicators = Indicators()

        self.conf = utils.read_config("./config/config.json")
        self.logger = utils.configure_logger("default", "./GLENNY_LOG.txt",
                                             self.conf["logging"])
        self.initialize_stocks(start,
                               self.logger,
                               self.conf,
                               number_of_stocks,
                               update_nasdaq_file=False,
                               stocks=stocks)

        self.results = {
            "stock": [],
            "bought": [],
            "price_bought": [],
            "number": [],
            "result": [],
            "start_date": [],
            "comment": [],
            "timestamp": [],
            "sell_criterium": [],
            "first_sold": [],
            "first_Pe": [],
            "first_N": [],
            "second_sold": [],
            "second_Pe": [],
            "second_N": [],
            "time_diff_bod": [],
            "time_diff_eod": [],
            'der_bigEMA': []
        }

        self.columns = [
            "timestamp", "stock", "result", "comment", "start_date", "bought",
            "first_sold", "second_sold", "price_bought", "first_Pe",
            "second_Pe", "number", "first_N", "second_N", "time_diff_bod",
            "der_bigEMA", "sell_criterium"
        ]

        self.stats = {
            "param": [],
            "type": [],
            "total_result_plot": [],
            "individual_result_plot": []
        }
        self.columns_stats = [
            'param', 'type', 'total_result_plot', 'individual_result_plot'
        ]

        self.csv_file = "./backtesting/backtesting_cumulative.csv"
        self.csv_file_stats = "./backtesting/backtesting_stats.csv"
        self.plot_dir = "./backtesting/back_plots/"
        self.stats_plot_dir = "./backtesting/stats_plots/"
        self.callsYQL_file = "./backtesting/calls_yql.json"
Example #34
def server(args):
    """
    Server function to drive the submission process.  Two main modes
    of operation are present.  First, user submissions can be directly
    submitted with the -b flag.  This is mainly used for debugging.
    The main mode of operation is without the -b flag, where the server
    will check the database for jobs that haven't been submitted and call
    submission_script_manager for each.

    Inputs:
    ------
    args - argparse arguments object that contains the database
    configuration instructions as well as other options.

    Returns:
    --------
    Nothing. For a more verbose output, use --debug=2 at the
    runtime.

    """

    logger = utils.configure_logger(args)
    db_conn, sql = setup_database(args)

    if args.UserSubmissionID != 'none':
        if update_tables.count_user_submission_id(args.UserSubmissionID,
                                                  sql) > 0:
            #if args.submit:
            logger.debug('Processing {}'.format(args.UserSubmissionID))

            #Need to remove any whitespace
            USID = args.UserSubmissionID.replace(" ", "")

            submission_script_manager.process_jobs(args, args.UserSubmissionID,
                                                   db_conn, sql)
        #else:
        #print("-s option not selected, not submitting jobs through submission_script_manager")
        else:
            print(
                "The selected UserSubmission (UserSubmissionID = {0}) does not exist, exiting"
                .format(args.UserSubmissionID))
            exit()

    # No UserSubmissionID specified, send all
    # that haven't been sent already.
    else:
        user_submissions = database.get_unsubmitted_jobs(sql)
        logger.debug('Found unsubmitted jobs: {}'.format(user_submissions))

        if len(user_submissions) == 0:
            print(
                "There are no UserSubmissions which have not yet been submitted to a farm"
            )

        else:
            for i, submission_id in enumerate(user_submissions):
                logger.debug(
                    'Working on job {} of {}, user_submission_id = {}'.format(
                        i + 1, len(user_submissions), submission_id))
                submission_script_manager.process_jobs(args, submission_id,
                                                       db_conn, sql)

    # Shutdown the database connection, we're done here.
    db_conn.close()
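A hedged sketch of how server(args) might be driven from the command line; the argument set below is trimmed for illustration and omits the database configuration options a real invocation would include:

import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    # Only two of the expected options are shown; names follow the attributes used above.
    parser.add_argument("--UserSubmissionID", default="none")
    parser.add_argument("--debug", default="0")
    args = parser.parse_args()
    server(args)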
Example #35
def train_model(language, data_dir, model_type=1, gru=3, num_epochs=100, mini_batch_size=32, iterlog=20, cp_freq=1000, restore_path=None, model_root = './trained-models/', multi_gpu=False):

    # Check language is supported
    if not check_language_code(language):
        raise ValueError("Invalid or not supported language code!")

    # Check description file exists
    if not os.path.exists(data_dir):
        raise ValueError("Description file does not exist!'")

    # Check valid model is selected
    if model_type == 1:
        from models import model_conv1_gru as model
    elif model_type == 2:
        from models import model_conv2_gru as model
    else:
        raise ValueError("No valid model selected!")

    # Create model directories
    model_name = model.__name__ + str(gru)
    model_dir = os.path.join(model_root, model_name)
    if multi_gpu:
        my_gpu_rank = hvd.local_rank()
        num_gpus = hvd.size()
    else:
        my_gpu_rank = 0
        num_gpus = 1

    if not multi_gpu or my_gpu_rank == 0:
        if not os.path.exists(model_root):
            os.makedirs(model_root)
        if not os.path.exists(model_dir):
            os.makedirs(model_dir)

    # Configure logging
    configure_logger(logFileName=os.path.join(model_root, 'training.log'))

    print('Loading data...')
    # Load char_map, index_map and gets number of classes
    char_map, index_map, nb_classes = get_language_chars(language)

    # Prepare the data generator. Load the JSON file that contains the dataset
    datagen = DataGenerator(char_map=char_map, multi_gpu=multi_gpu)
    # Loads data limited with max duration. returns number of iterations.
    steps_per_epoch = datagen.load_data(data_dir, minibatch_size=mini_batch_size, max_duration=20.0)
    print('Building Model...')
    
    global_step = tf.Variable(0, name='global_step', trainable=False)

    # Create input placeholders for CTC Cost feeding and decoding feeding
    with tf.name_scope('inputs'):
        # Audio inputs have size of [batch_size, max_stepsize, num_features]. But the batch_size and max_stepsize can vary along each step
        # inputs = tf.placeholder(tf.float32, [None, None, 161], name='inputs') # spectrogram version
        inputs = tf.placeholder(tf.float32, [None, None, 40], name='inputs') # filterbank version; 40 is the number of filter banks
        # inputs = tf.placeholder(tf.float32, [None, None, 12], name='inputs') # mfcc version. 12 shows number of ceps.
        # 1d array of size [batch_size]
        seq_len = tf.placeholder(tf.int32, [None], name='seq_len')
        # We define the placeholder for the labels. Here we use sparse_placeholder that will generate a SparseTensor required by ctc_loss op.
        targets = tf.sparse_placeholder(tf.int32, name='targets')

    # Create model layers
    logits = model(inputs, nb_classes, gru)
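    # tf.nn.ctc_loss (with its default time_major=True) and ctc_beam_search_decoder expect
    # time-major logits of shape [max_time, batch_size, num_classes], so transpose the
    # model's (batch-major) output accordingly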
    logits = tf.transpose(logits, perm=[1, 0, 2])

    # Compute the CTC loss with TensorFlow's ctc_loss, then average it across the batch
    with tf.name_scope('loss'):
        total_loss = tf.nn.ctc_loss(inputs=logits, labels=targets, sequence_length=seq_len, ignore_longer_outputs_than_inputs=True)
        avg_loss = tf.reduce_mean(total_loss, name="Mean")

    # The Adam optimizer is used to update the weights; it was preferred for performance reasons
    with tf.name_scope('train'):
        optimizer = tf.train.AdamOptimizer(learning_rate=0.0001, beta1=0.9, beta2=0.999, epsilon=1e-8)
        if multi_gpu:
            optimizer = hvd.DistributedOptimizer(optimizer)
        train_op = optimizer.minimize(avg_loss, global_step=global_step)
        # optimizer = tf.train.MomentumOptimizer(learning_rate= 2e-4, momentum=0.99, use_nesterov=True).minimize(avg_loss)

    # Beam search decodes the mini-batch
    with tf.name_scope('decoder'):
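        # beam_width=100 keeps the 100 most probable prefixes at each step; top_paths=1 returns only the best hypothesis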
        decoded, log_prob = tf.nn.ctc_beam_search_decoder(logits, seq_len, beam_width=100, top_paths=1, merge_repeated=False)
        # Option 2: tf.nn.ctc_greedy_decoder (faster, but gives worse results)
        dense_decoded = tf.sparse_tensor_to_dense(decoded[0], name="SparseToDense", default_value=-1)

    # Levenshtein (edit) distance between the decoded sequences and their target transcriptions
    with tf.name_scope('distance'):
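        # tf.edit_distance defaults to normalize=True, i.e. each distance is divided by the target sequence length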
        distance = tf.edit_distance(tf.cast(decoded[0], tf.int32), targets, name="edit_distance")
        # Average the normalized edit distance over the batch to get the label error rate (LER)
        ler = tf.reduce_mean(distance, name="Mean")

    config = tf.ConfigProto(allow_soft_placement=False, log_device_placement=False)
    init = tf.global_variables_initializer()
    total_steps = num_epochs * steps_per_epoch
    if multi_gpu:
        config.gpu_options.allow_growth = False
        config.gpu_options.visible_device_list = str(my_gpu_rank)
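        # Broadcasting from rank 0 ensures every worker starts from identical variable values
        # (the initial weights, or a restored checkpoint)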
        bcast = hvd.broadcast_global_variables(0)
        # Normally we would divide num_steps by hvd.size(),
        # but our DataGen has already done that, so we don't need it here
        # num_steps = num_steps // hvd.size() + 1
    else:
        bcast = None
    print('Training...')
    iterator = None
    best_cost = 1e10
    lbest_cost = best_cost
    saver = tf.train.Saver(max_to_keep=50)
    session = tf.Session(config=config)
    with session.as_default():
        history_file = os.path.join(model_dir, 'history.log')
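        # history.log (written by write_history below) holds one pipe-separated record per checkpoint:
        #   <?>|epoch|epoch_step|loss|best_cost|global_step|checkpoint_path
        # The first field is not used here (presumably a timestamp); the last record drives resuming.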
        if os.path.exists(history_file):
            entries = codecs.open(history_file, 'r', 'utf-8').readlines()
            last_entry = entries[-1].strip()
            _, h_epoch, h_epoch_step, h_loss, h_best_cost, h_cgs, h_ckptfile = last_entry.split('|')
            saver.restore(session, save_path=tf.train.latest_checkpoint(model_dir))
            epoch = int(h_epoch)
            current_epoch_step = int(h_epoch_step) + 1
            current_global_step = int(h_cgs) + 1
            remaining_epoch_steps = max(0, steps_per_epoch - current_epoch_step)
            best_cost = float(h_best_cost)
        else:
            epoch = 1
            current_epoch_step = 1
            current_global_step = 1
            remaining_epoch_steps = steps_per_epoch
            init.run()
            if bcast is not None:
                bcast.run()
        tf.get_default_graph().finalize()
        while epoch <= num_epochs:
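            # Epoch 1 iterates utterances sorted by duration without shuffling (a common warm-up
            # for CTC training); later epochs shuffle the data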
            if epoch == 1:
                iterator = datagen.iterate_train(mini_batch_size=mini_batch_size, sort_by_duration=True, shuffle=False, max_iters=remaining_epoch_steps)
            else:
                iterator = datagen.iterate_train(mini_batch_size=mini_batch_size, sort_by_duration=False, shuffle=True, max_iters=remaining_epoch_steps)
            while current_epoch_step < steps_per_epoch:
                b_perc = int(float(current_epoch_step) / float(steps_per_epoch) * 100.0)
                inputs, out_len, indices, values, shape, labels = next(iterator)
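                # targets is a sparse_placeholder, so its indices/values/shape component tensors are fed individually by name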
                feed = {"inputs/inputs:0": inputs, "inputs/targets/shape:0": shape, "inputs/targets/indices:0": indices, "inputs/targets/values:0": values, "inputs/seq_len:0": out_len}
                step_start_time = time.time()
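                # Every `iterlog` global steps, also run the decoder and log sample outputs
                # (IT/OT/DT appear to stand for iteration / original transcript / decoded transcript)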
                if current_global_step % iterlog == 0:
                    _, ctc_cost, cError, cDecoded = session.run([train_op, avg_loss, ler, dense_decoded], feed_dict=feed)
                    step_end_time = time.time()
                    batch_error = cError * mini_batch_size
                    if not multi_gpu or my_gpu_rank == 0:
                        for i, seq in enumerate(cDecoded):
                            seq = [s for s in seq if s != -1]
                            sequence = convert_int_sequence_to_text_sequence(seq, index_map)
                            logger.info("IT      : {}-{}".format(current_global_step, str(i + 1)))
                            logger.info("OT ({:3d}): {}".format(len(labels[i]), labels[i]))
                            logger.info("DT ({:3d}): {}".format(len(sequence), sequence))
                            logger.info('-' * 100)
                else:
                    ctc_cost, _ = session.run([avg_loss, train_op], feed_dict=feed)
                    step_end_time = time.time()
                    if not multi_gpu or my_gpu_rank == 0:
                        best_cost_str = 'N/A' if epoch <= 1 else '{:.5f}'.format(best_cost)
                        logger.info("Epoch:{:-4d}, ES:{:-6d}, GS:{:-6d}, Loss:{:.5f}, BestLoss:{}, Time:{:.3f}".format(epoch, current_epoch_step, current_global_step, ctc_cost, best_cost_str, step_end_time - step_start_time))
                # Ignore best_cost during Epoch 1 run...
                if epoch > 1 and ctc_cost < best_cost: 
                    lbest_cost = best_cost
                    best_cost = ctc_cost
                print('Epoch: {}/{}, Step: {:-6d}/{} {:-3}% -- [Loss: {:-9.5f}, BestLoss: {:-9.5f}, Time: {:.4f}]'.format(epoch, num_epochs, current_epoch_step, steps_per_epoch, b_perc, ctc_cost, best_cost, step_end_time - step_start_time), end='\r')
                sys.stdout.flush()
                # Save every cp_freq steps, or whenever we find a better best_cost
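                # Only rank 0 writes checkpoints, so multiple ranks never write the same files in multi-GPU runs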
                if (current_global_step % cp_freq == 0 or best_cost < lbest_cost) and my_gpu_rank == 0:
                    print('\n*** Saving checkpoint at Epoch {}, Step {} (GS: {})'.format(epoch, current_epoch_step, current_global_step))
                    lbest_cost = best_cost
                    saved_path = saver.save(session, os.path.join(model_dir, 'model'), global_step = current_global_step)
                    write_history(model_dir, epoch, current_epoch_step, ctc_cost, best_cost, current_global_step, saved_path)
                current_epoch_step += 1
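                # With Horovod each rank processes a different shard in parallel, so the global step advances by num_gpus per local step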
                current_global_step += num_gpus
                # Trigger the garbage collector explicitly to keep RAM usage in check
                gc.collect()
            epoch += 1
            current_epoch_step = 0
            remaining_epoch_steps = steps_per_epoch
    print('\n')