Example #1
import logging
import os
import random
import sys

# "pp" (the project's preprocessing helpers) and prepareModel() are defined
# elsewhere in the original project; only the missing standard-library
# imports are added here.

def main(argv):
    logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)

    train_folder_name = argv[1]
    dict_file_name = argv[2]
    out_dir = argv[3]
    seq_max_len = int(argv[4])   # 32 seems OK with tags
    lstm_size = int(argv[5])
    epochs = int(argv[6])
    batch_size = int(argv[7])    # e.g. 2304 for a GeForce 780; 40 or 128 for CPU

    dictionary = pp.read_gzip_data(dict_file_name)
    dict_size = len(dictionary)

    if epochs > 0:

        if not os.path.isdir(out_dir):
            os.makedirs(out_dir)

        weights_path = os.path.join(out_dir, 'weights.h5')
        model_path = os.path.join(out_dir, 'model.json')
        model = prepareModel(seq_max_len, dict_size, lstm_size, model_path)

        # Resume from a previous run if saved weights are present.
        if os.path.isfile(weights_path):
            logging.info("Reading weights from existing file...")
            model.load_weights(weights_path)

        train_files = os.listdir(train_folder_name)

        if len(train_files) > 1:
            logging.info("Training on batches...")

            for e in range(epochs):
                logging.info('Starting epoch {0}...'.format(e))
                random.shuffle(train_files)

                for train_file_name in train_files:
                    # Skip editor swap files left in the training folder.
                    if not train_file_name.endswith('.swp'):
                        train_data = pp.read_gzip_data(os.path.join(train_folder_name, train_file_name))
                        train_data = pp.convertDataToKerasFormat(train_data, seq_max_len, dictionary)

                        # Next-step prediction: inputs are all samples but the
                        # last; targets are the same samples shifted by one.
                        model.train_on_batch(train_data[:-1], train_data[1:])

                logging.info("Saving weights...")
                model.save_weights(weights_path, overwrite=True)

        elif len(train_files) == 1:
            logging.info("Fitting model...")

            train_data = pp.read_gzip_data(os.path.join(train_folder_name, train_files[0]))
            train_data = pp.convertDataToKerasFormat(train_data, seq_max_len, dictionary)

            # nb_epoch is the Keras 1.x name of the epochs argument.
            model.fit(train_data[:-1], train_data[1:], batch_size=batch_size, nb_epoch=epochs)

            logging.info("Saving weights...")
            model.save_weights(weights_path, overwrite=True)

        else:
            logging.error("There are no files to train on!")
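
A minimal sketch of how this entry point would be invoked from the command
line; the script and file names below are placeholders, only the argv layout
is taken from the code above.

if __name__ == '__main__':
    # e.g.: python train.py train_batches/ dictionary.pkl.gz out/ 32 512 10 128
    main(sys.argv)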
Example #2
import logging
import os
import sys

from keras.models import model_from_json

# "pp" (the project's preprocessing helpers), testModel() and
# printFinalResults() are defined elsewhere in the original project.

def main(argv):
    logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)

    data_file_name = argv[1]
    dict_file_name = argv[2]
    working_dir = argv[3]
    seq_max_len = int(argv[4])
    suffix_len = int(argv[5])
    batch_size = 2304
    it_size = batch_size  # number of test samples scored per chunk

    logging.info('Reading model...')
    with open(os.path.join(working_dir, 'model.json'), 'r') as model_file:
        model = model_from_json(model_file.read())

    logging.info('Reading weights...')
    model.load_weights(os.path.join(working_dir, 'weights.h5'))

    dictionary = pp.read_gzip_data(dict_file_name)

    test_data, test_data_count = pp.prepareTestData(data_file_name)

    test_data = pp.prepareData(test_data, seq_max_len, dictionary, suffix_len)
    pp.save_gzip_data(os.path.join(working_dir, 'test_data.txt.gz'), test_data)
    test_data = pp.convertDataToKerasFormat(test_data, seq_max_len, dictionary)

    logging.info('Testing model on data...')
    full_size = len(test_data)
    it = 0
    res_win = 0
    res_los = 0
    res_inc = 0
    res_all = 0
    it_count = 0
    # Score the test data in chunks of at most it_size samples, never
    # splitting a group of test_data_count samples across two chunks.
    # This assumes every group fits within it_size samples; a larger
    # group would stop the loop from advancing.
    while it < full_size:
        starting_it_count = it_count
        it2 = it
        for one_count in range(it_count, len(test_data_count)):
            if it2 + test_data_count[one_count] > min(full_size, it + it_size):
                break
            else:
                it_count += 1
                it2 += test_data_count[one_count]
        tmp_test_data = test_data[it:it2]
        tmp_test_data_count = test_data_count[starting_it_count:it_count]
        tmp_results = model.predict_proba(tmp_test_data, batch_size=batch_size)
        win, los, inc, al = testModel(tmp_test_data, tmp_results, tmp_test_data_count)
        res_win += win
        res_los += los
        res_inc += inc
        res_all += al
        it = it2

    printFinalResults(res_win, res_los, res_inc, res_all)
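
Neither example includes the "pp" helper module itself. A rough sketch of
what the two gzip helpers used above could look like, assuming they pickle
plain Python objects into gzip-compressed files (the use of pickle is an
assumption, not shown in the source):

import gzip
import pickle

def read_gzip_data(file_name):
    # Hypothetical stand-in for pp.read_gzip_data: load one pickled
    # object from a gzip-compressed file.
    with gzip.open(file_name, 'rb') as f:
        return pickle.load(f)

def save_gzip_data(file_name, data):
    # Hypothetical stand-in for pp.save_gzip_data: pickle one object
    # into a gzip-compressed file.
    with gzip.open(file_name, 'wb') as f:
        pickle.dump(data, f)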