Example #1
def main():
    input_dim = 6
    spatial_dims = [0, 1, 2]
    args = utils.read_args()

    experiment_dir = utils.get_experiment_dir(args.name, args.run)
    utils.initialize_experiment_if_needed(experiment_dir, args.evaluate)
    # Logger will print to stdout and logfile
    utils.initialize_logger(experiment_dir)

    # Optionally restore arguments from previous training
    # Useful if training is interrupted
    if not args.evaluate:
        try:
            args = utils.load_args(experiment_dir)
        except:
            args.best_tpr = 0.0
            args.nb_epochs_complete = 0  # Track in case training interrupted
            utils.save_args(experiment_dir, args)  # Save initial args

    net = utils.create_or_restore_model(experiment_dir, args.nb_hidden,
                                        args.nb_layer, input_dim, spatial_dims)
    if torch.cuda.is_available():
        net = net.cuda()
        logging.warning("Training on GPU")
        logging.info("GPU type:\n{}".format(torch.cuda.get_device_name(0)))
    criterion = nn.functional.binary_cross_entropy
    if not args.evaluate:
        assert (args.train_file != None)
        assert (args.val_file != None)
        train_loader = construct_loader(args.train_file,
                                        args.nb_train,
                                        args.batch_size,
                                        shuffle=True)
        valid_loader = construct_loader(args.val_file, args.nb_val,
                                        args.batch_size)
        logging.info("Training on {} samples.".format(
            len(train_loader) * args.batch_size))
        logging.info("Validate on {} samples.".format(
            len(valid_loader) * args.batch_size))
        train(net, criterion, args, experiment_dir, train_loader, valid_loader)

    # Perform evaluation over test set
    try:
        net = utils.load_best_model(experiment_dir)
        logging.warning("\nBest model loaded for evaluation on test set.")
    except:
        logging.warning(
            "\nCould not load best model for test set. Using current.")
    assert (args.test_file != None)
    test_loader = construct_loader(args.test_file, args.nb_test,
                                   args.batch_size)
    test_stats = evaluate(net, criterion, experiment_dir, args, test_loader,
                          TEST_NAME)
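Example #1 saves the argparse namespace into the experiment directory so that an interrupted run can be resumed with identical settings: utils.load_args is tried first, and only when that fails are fresh defaults written with utils.save_args. Those helpers are not shown on this page; a minimal pickle-based sketch of the same pattern (the args.pkl file name is an assumption, not taken from the project) could look like:

import os
import pickle

def save_args(experiment_dir, args):
    # Persist the argparse.Namespace so a restarted run reuses the same settings.
    with open(os.path.join(experiment_dir, "args.pkl"), "wb") as f:
        pickle.dump(args, f)

def load_args(experiment_dir):
    # Fails on the first run (no file yet), which the caller above treats as "start fresh".
    with open(os.path.join(experiment_dir, "args.pkl"), "rb") as f:
        return pickle.load(f)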
Example #2
def evaluate(sys_argv):

    from utils import read_args

    opts = read_args(args_as_a_list=sys_argv[1:])

    from utils.train import models_path

    evaluate_model_dir_path(models_dir_path=models_path,
                            model_dir_path=opts.model_path,
                            model_epoch_dir_path=opts.model_epoch_path)
Example #3
def xnlp_experiments(sys_argv):

    from utils import read_args

    opts = read_args(args_as_a_list=sys_argv[1:], for_xnlp=True)

    from utils.train import models_path

    model, data_dict, id_to_tag, word_to_id, stats_dict = do_xnlp(
        models_dir_path=models_path,
        model_dir_path=opts.model_path,
        model_epoch_dir_path=opts.model_epoch_path)
Example #4
def start_webapp(sys_argv):

    from utils import read_args

    opts = read_args(args_as_a_list=sys_argv[1:])

    assert type(opts.port) == int

    print("Creating app object")
    app = make_app(opts)
    print("Listening")
    app.listen(opts.port)
    print("Starting the loop")
    tornado.ioloop.IOLoop.current().start()
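Example #4 follows the standard tornado startup sequence: build the application, bind it to opts.port, and start the IOLoop. The project's make_app is not shown on this page; a minimal application factory of the shape this pattern expects (the /ping route and handler are purely illustrative) might look like:

import tornado.web

class PingHandler(tornado.web.RequestHandler):
    def get(self):
        self.write("ok")

def make_app(opts):
    # The real make_app presumably wires routes and settings from opts;
    # this toy version ignores it and registers a single handler.
    return tornado.web.Application([(r"/ping", PingHandler)])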
Example #5
def predict_from_stdin(sys_argv):

    from utils import read_args

    opts = read_args(args_as_a_list=sys_argv[1:])

    from utils.train import models_path

    model, opts, parameters = initialize_model_with_pretrained_parameters(
        opts.model_path, opts.model_epoch_path, models_path)

    line = sys.stdin.readline()
    while line:
        # "ali ata bak\ndeneme deneme"
        predict_sentences_given_model(line.decode("utf8"), model)
        line = sys.stdin.readline()
Example #6
def build_model(self, netpath: str = None):
    if self.outchannel is None:
        self.outchannel = self.img_.shape[1]

    if self.args.net == "load":
        _args = u.read_args(os.path.join('results', *netpath.split('/')[:-1], "args.txt"))
        assert net_args_are_same(self.args, _args)
        self.net = get_net(_args, self.outchannel).type(self.dtype)
        self.net.load_state_dict(torch.load(os.path.join('results', netpath)))
    else:
        self.net = get_net(self.args, self.outchannel).type(self.dtype)
        u.init_weights(self.net, self.args.inittype, self.args.initgain)

    # self.net = self.net.type(self.dtype)
    #
    # if self.args.net != 'load':
    #     u.init_weights(self.net, self.args.inittype, self.args.initgain)
    self.parameters = u.get_params('net', self.net, self.input_)
    self.num_params = sum(np.prod(list(p.size())) for p in self.net.parameters())
Example #7
def testar(ntwk, rodada, dados_carregado, largura):
    with open('base_BF_rod' + str(rodada) + '.pkl', 'rb') as f:
        base = dill.load(f)

    lista, y_train = base[0], base[2]

    args = utils.read_args('configs/default.ast')
    num_epochs, nnet_args = args['num_epochs'], args['nnet_args']
    chars = utils.mapeamento_palavra()
    num_classes = len(chars)

    num_samples = len(lista)
    printer = utils.Printer(chars)

    data_x, data_y = formata_padrao_entrada_saida(num_classes, y_train,
                                                  dados_carregado, lista,
                                                  largura)

    img_ht = data_x[0].shape[0]

    acertos = 0
    erros = 0
    quantidade_total = len(lista)

    for c in range(1):
        for i in range(len(lista)):
            # print(lista[i])
            x = data_x[i]
            y = data_y[i]

            _, esperado = printer.yprint2(y)
            saida_obtida, _ = ntwk.tester(x)
            rotulo_pred, retornado2 = printer.rotulo_(saida_obtida)

            resultado = verificar(retornado2)

            if (esperado == resultado):
                acertos += 1
            else:
                erros += 1

    return acertos, erros
Example #8
def main():
    # read args
    args = u.read_args()
    u.create_directories(args)

    #create classification model
    c = Classifier(args)

    #if training flag is true build model and train it
    if args['train']:

        model = c.build()
        plot_model(model,
                   to_file=args['exp_dir'] + 'modelimage' + '.png',
                   show_layer_names=False,
                   show_shapes=False)
        operator = Train(model, args)
        operator.train()
        operator.validate()

    #if test is true, load best model and test it
    if args['test']:
        #load data only without creating model
        operator = Train(None, args)
        operator.validate()
        true, predicted = operator.test()

        #plot confusion matrix
        class_names = ['0', '1']
        cf = confusion_matrix(true, predicted)
        plt.figure()
        u.plot_confusion_matrix(
            cf,
            classes=class_names,
            normalize=False,
            title='Confusion matrix, without normalization')
Example #9
# http://rosalind.info/problems/fib/

from utils import read_args

# n = number of months
# k = number of offspring pairs each pair produces
n, k = read_args(2)

# they produce offspring after 2 months
wabbits = [1, 1]

while len(wabbits) < n:
    wabbits.append(wabbits[-1] + wabbits[-2] * k)

print(wabbits[-1])
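The comments above describe the recurrence F(n) = F(n-1) + k * F(n-2) with F(1) = F(2) = 1, which is exactly what the list-based loop computes. A self-contained sketch of the same recurrence, checked against the Rosalind sample case n = 5, k = 3 (the function name fib_pairs is ours, not from the snippet):

def fib_pairs(n, k):
    # Pairs of rabbits after n months, when every mature pair produces k new pairs per month.
    prev, curr = 1, 1  # months 1 and 2
    for _ in range(n - 2):
        prev, curr = curr, curr + prev * k
    return curr

assert fib_pairs(5, 3) == 19  # same value as wabbits[-1] from the loop above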
Example #10
def run_a_single_configuration_without_fabric(
        crf, lr_method, dropout, char_dim, char_lstm_dim, morpho_tag_dim,
        morpho_tag_lstm_dim, morpho_tag_type, morpho_tag_column_index,
        word_dim, word_lstm_dim, cap_dim, separate_bilstms, skip_testing,
        max_epochs, train_filepath, dev_filepath, test_filepath,
        embeddings_filepath, reload, _run):

    from sacred.observers import MongoObserver
    """
    python train.py --pre_emb ../../data/we-300.txt --train dataset/tr.train --dev dataset/tr.test --test dataset/tr.test --word_dim 300  --word_lstm_dim 200 --word_bidirect 1 --cap_dim 100 --crf 1 --lr_method=sgd-lr_0.01 --maximum-epochs 100 --char_dim 200 --char_lstm_dim 200 --char_bidirect 1 --morpho_tag_dim 100 --morpho_tag_lstm_dim 100 --morpho_tag_type char --overwrite-mappings 1 --batch-size 5
    """

    execution_part = "python train.py "

    if word_dim == 0:
        embeddings_part = ""
    else:
        embeddings_part = "--pre_emb ../../datasets/%s " % embeddings_filepath

    print(train_filepath, dev_filepath, test_filepath, skip_testing,
          max_epochs)

    always_constant_part = "-T ../../datasets/%s " \
          "-d ../../datasets/%s " \
          "-t ../../datasets/%s " \
          "%s" \
          "--skip-testing %d " \
          "--tag_scheme iobes " \
          "--maximum-epochs %d " % (train_filepath, dev_filepath, test_filepath, embeddings_part, skip_testing, max_epochs)

    commandline_args = always_constant_part + \
              "--crf %d " \
              "--lr_method %s " \
              "--dropout %1.1lf " \
              "--char_dim %d " \
              "--char_lstm_dim %d " \
              "--morpho_tag_dim %d " \
              "--morpho_tag_lstm_dim %d " \
              "--morpho_tag_type %s " \
              "--morpho-tag-column-index %d " \
              "--word_dim %d " \
              "--word_lstm_dim %d "\
              "--cap_dim %d "\
              "--separate-bilstms %d "\
              "--reload %d" % (crf,
                               lr_method,
                               dropout,
                               char_dim,
                               char_lstm_dim,
                               morpho_tag_dim,
                               morpho_tag_lstm_dim,
                               morpho_tag_type,
                               morpho_tag_column_index,
                               word_dim,
                               word_lstm_dim,
                               cap_dim,
                               separate_bilstms,
                               reload)

    tagger_root = "/media/storage/genie/turkish-ner/code/tagger"

    print _run
    print _run.info

    print subprocess.check_output(["id"])
    print subprocess.check_output(["pwd"])

    opts = read_args(commandline_args.split(" "))
    # print opts
    parameters = form_parameters_dict(opts)
    # print parameters
    # model_path = get_name(parameters)
    model_path = get_model_subpath(parameters)
    print model_path

    _run.info['costs'] = dict()
    _run.info['best_performances'] = dict()

    _run.info['starting'] = 1

    dummy_prefix = ""

    print dummy_prefix + execution_part + commandline_args
    process = subprocess.Popen(
        (dummy_prefix + execution_part + commandline_args).split(" "),
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT)

    def record_metric(_run, epoch, samples, label, value):
        if str(epoch) in _run.info[label]:
            _run.info[label][str(epoch)].append(value)
        else:
            _run.info[label][str(epoch)] = list()
            _run.info[label][str(epoch)].append(value)

    for line in iter(process.stdout.readline, ''):
        sys.stdout.write(line)
        m = re.match(
            "^Epoch (\d+): (\d+) Samples read. Avg. cost: ([^,]+), Scores on dev: ([^,]+), (.+)$",
            line)
        if m:
            epoch = int(m.group(1))
            samples = int(m.group(2))
            epoch_avg_cost = float(m.group(3))
            if skip_testing == 1 or dev_filepath == test_filepath:
                epoch_performance = float(m.group(4))
            else:
                epoch_performance = float(m.group(5))
            record_metric(_run, epoch, samples, "costs", epoch_avg_cost)
            record_metric(_run, epoch, samples, "best_performances",
                          epoch_performance)
        sys.stdout.flush()

    # for epoch in range(max_epochs):
    #     epoch_cost = subprocess.check_output(("tail -1 %s" % os.path.join("models", model_path, "epoch-%08d" % epoch, "epoch_cost.txt")).split(" "))
    #     best_performances = subprocess.check_output(("cat %s" % os.path.join("models", model_path, "epoch-%08d" % epoch, "best_performances.txt")).split(" "))
    #     print "EPOCHCOST: " + epoch_cost
    #     _run.info['costs'][str(epoch)] = float(epoch_cost.strip())
    #     print "BESTPERF: " + best_performances
    #     if skip_testing == 1 or dev_filepath == test_filepath:
    #         _run.info['best_performances'][str(epoch)] = float(best_performances.split(" ")[0])
    #     else:
    #         _run.info['best_performances'][str(epoch)] = float(best_performances.split(" ")[1])

    return model_path
Example #11
import sys
from datetime import datetime as dt

import editdistance
import numpy as np
import theano as th

import rnn_ctc.neuralnet as nn
# from parscribe import ParScribe as Scribe
from scribe import Scribe
import utils
import telugu as lang

############################################ Read Args
args = utils.read_args(sys.argv[1:])
num_samples, num_epochs = args['num_samples'], args['num_epochs']
scribe_args, nnet_args = args['scribe_args'], args['nnet_args']

if len(sys.argv) > 1:
    output_fname = '-'.join(sorted(sys.argv[1:]))
    output_fname = output_fname.replace('.ast', '').replace('/', '').replace('configs', '')
else:
    output_fname = "default"
network_fname = '{}.pkl'.format(output_fname)
output_fname += '_' + dt.now().strftime('%y%m%d_%H%M') + '.txt'
distances, wts = [], []
print("Output will be written to: ", output_fname)

# Initialize Language
lang.select_labeler(args['labeler'])
Example #12
import dill
with open('BLSTM.pkl', 'rb') as pkl_file:
    layer2, layer1, image = dill.load(pkl_file)

rodada = 10
# largura = 60

with open('base_BF_rod' + str(rodada) + '_' + nome_arquivo + '_teste.pkl',
          'rb') as f:
    base = dill.load(f)

lista, y_train = base[0], base[2]

dados_carregado = cd.Carrega()

args = utils.read_args('configs/default.ast')
num_epochs, nnet_args = args['num_epochs'], args['nnet_args']
chars = utils.mapeamento_palavra()
num_classes = len(chars)

num_samples = len(lista)
printer = utils.Printer(chars)

data_x, data_y = [], []

for indice in range(len(lista)):
    y = utils.classe(y_train[indice])  # Retrieve the word
    y = utils.palavra_indice(y)
    y1 = utils.insere_blanks(y, num_classes)
    data_y.append(np.asarray(y1, dtype=np.int32))
    _, sinal = aux.deslocamento_amostra(lista[indice], larg=70)
Example #13
def train_a_single_configuration(
                                              datasets_root,
                                              crf,
                                              lr_method,
                                              batch_size,
                                              sparse_updates_enabled,
                                              dropout,
                                              char_dim,
                                              char_lstm_dim,
                                              morpho_tag_dim,
                                              morpho_tag_lstm_dim,
                                              morpho_tag_type,
                                              morpho_tag_column_index,
                                              word_dim,
                                              word_lstm_dim,
                                              cap_dim, skip_testing, max_epochs,
                                              train_filepath,
                                              dev_filepath,
                                              test_filepath,
                                              yuret_train_filepath,
                                              yuret_test_filepath,
                                              train_with_yuret,
                                              test_with_yuret,
                                              use_golden_morpho_analysis_in_word_representation,
                                              embeddings_filepath,
                                              integration_mode,
                                              active_models,
                                              multilayer,
                                              shortcut_connections,
                                              reload,
                                              dynet_gpu,
                                              _run):

    """
    python train.py --pre_emb ../../data/we-300.txt --train dataset/gungor.ner.train.only_consistent --dev dataset/gungor.ner.dev.only_consistent --test dataset/gungor.ner.test.only_consistent --word_dim 300  --word_lstm_dim 200 --word_bidirect 1 --cap_dim 100 --crf 1 [email protected] --maximum-epochs 50 --char_dim 200 --char_lstm_dim 200 --char_bidirect 1 --overwrite-mappings 1 --batch-size 1 --morpho_tag_dim 100 --integration_mode 2
    """

    execution_part = "python main.py --command train --overwrite-mappings 1 "

    if sparse_updates_enabled == 0:
        execution_part += "--disable_sparse_updates "

    if dynet_gpu == 1:
        execution_part += "--dynet-gpu 1 "

    if train_with_yuret == 1:
        execution_part += "--train_with_yuret "

    if use_golden_morpho_analysis_in_word_representation == 1:
        execution_part += "--use_golden_morpho_analysis_in_word_representation "

    if word_dim == 0:
        embeddings_part = ""
    else:
        if embeddings_filepath:
            embeddings_part = "--pre_emb %s/%s " % (datasets_root, embeddings_filepath)
        else:
            embeddings_part = ""

    print (train_filepath, dev_filepath, test_filepath, skip_testing, max_epochs)

    always_constant_part = "-T %s/%s " \
          "-d %s/%s " \
          "-t %s/%s " \
          "%s" \
          "%s" \
          "--yuret_train %s/%s " \
          "--yuret_test %s/%s " \
          "%s" \
          "--skip-testing %d " \
          "--tag_scheme iobes " \
          "--maximum-epochs %d " % (datasets_root, train_filepath,
                                    datasets_root, dev_filepath,
                                    datasets_root, test_filepath,
                                    "--train_with_yuret " if train_with_yuret else "",
                                    "--test_with_yuret " if test_with_yuret else "",
                                    datasets_root, yuret_train_filepath,
                                    datasets_root, yuret_test_filepath,
                                    embeddings_part,
                                    skip_testing, max_epochs)

    commandline_args = always_constant_part + \
              "--crf %d " \
              "--lr_method %s " \
              "--batch-size %d " \
              "--dropout %1.1lf " \
              "--char_dim %d " \
              "--char_lstm_dim %d " \
              "--morpho_tag_dim %d " \
              "--morpho_tag_lstm_dim %d " \
              "--morpho_tag_type %s " \
              "--morpho-tag-column-index %d " \
              "--word_dim %d " \
              "--word_lstm_dim %d "\
              "--cap_dim %d "\
              "--integration_mode %d " \
              "--active_models %d " \
              "--multilayer %d " \
              "--shortcut_connections %d " \
              "--reload %d" % (crf,
                               lr_method,
                               batch_size,
                               dropout,
                               char_dim,
                               char_lstm_dim,
                               morpho_tag_dim,
                               morpho_tag_lstm_dim,
                               morpho_tag_type,
                               morpho_tag_column_index,
                               word_dim,
                               word_lstm_dim,
                               cap_dim,
                               integration_mode,
                               active_models,
                               multilayer,
                               shortcut_connections,
                               reload)

    # tagger_root = "/media/storage/genie/turkish-ner/code/tagger"

    print _run
    print _run.info

    print subprocess.check_output(["id"])
    print subprocess.check_output(["pwd"])

    opts = read_args(args_as_a_list=commandline_args.split(" "))
    print opts
    parameters = form_parameters_dict(opts)
    print parameters
    # model_path = get_name(parameters)
    model_path = get_model_subpath(parameters)
    print model_path

    task_names = ["NER", "MORPH", "YURET"]

    for task_name in task_names:
        _run.info["%s_dev_f_score" % task_name] = dict()
        _run.info["%s_test_f_score" % task_name] = dict()

    _run.info['starting'] = 1

    dummy_prefix = ""

    full_commandline = dummy_prefix + execution_part + commandline_args

    print full_commandline
    process = subprocess.Popen(full_commandline.split(" "),
                               stdout=subprocess.PIPE,
                               stderr=subprocess.STDOUT)

    def record_metric(epoch, label, value):
        if str(epoch) in _run.info[label]:
            _run.info[label][str(epoch)].append(value)
        else:
            _run.info[label][str(epoch)] = list()
            _run.info[label][str(epoch)].append(value)

    def capture_information(line):

        # 1
        """
        NER Epoch: %d Best dev and accompanying test score, best_dev, best_test: %lf %lf 
        """
        for task_name in task_names:
            m = re.match("^%s Epoch: (\d+) .* best_dev, best_test: (.+) (.+)$" % task_name, line)
            if m:
                epoch = int(m.group(1))
                best_dev = float(m.group(2))
                best_test = float(m.group(3))

                record_metric(epoch, "%s_dev_f_score" % task_name, best_dev)
                record_metric(epoch, "%s_test_f_score" % task_name, best_test)

    for line in iter(process.stdout.readline, ''):
        sys.stdout.write(line)
        capture_information(line)
        sys.stdout.flush()

    return model_path
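The docstring inside capture_information above gives the stdout line format these regular expressions expect; each match yields the epoch number plus the best dev and test scores. A minimal standalone check with made-up scores (71.25 and 69.80 are illustrative only, not from any real run):

import re

line = "NER Epoch: 3 Best dev and accompanying test score, best_dev, best_test: 71.25 69.80"
m = re.match(r"^NER Epoch: (\d+) .* best_dev, best_test: (.+) (.+)$", line)
assert m is not None
assert (int(m.group(1)), float(m.group(2)), float(m.group(3))) == (3, 71.25, 69.80)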
Example #14
import logging
import os

import numpy as np
import tensorflow as tf

from utils import read_args, form_parameters_dict, models_path, eval_script, eval_temp, iobes_iob

import loader
from loader import calculate_global_maxes, update_tag_scheme, \
    word_mapping, augment_with_pretrained, char_mapping, tag_mapping, prepare_dataset

from model_tensorflow import Model

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("eval")

# Read parameters from command line
opts = read_args(evaluation=True)

# Parse parameters
parameters = form_parameters_dict(opts)

# Check parameters validity
assert os.path.isfile(opts.train)
assert os.path.isfile(opts.dev)
assert os.path.isfile(opts.test)
assert parameters['char_dim'] > 0 or parameters['word_dim'] > 0
assert 0. <= parameters['dropout'] < 1.0
assert parameters['t_s'] in ['iob', 'iobes']
assert not parameters['all_emb'] or parameters['pre_emb']
assert not parameters['pre_emb'] or parameters['word_dim'] > 0
assert not parameters['pre_emb'] or os.path.isfile(parameters['pre_emb'])
Example #15
                    self.err_best_g = self.err[j]

            self.update(args)
            if args.verbose:
                print("#", i + 1, "\tBest Solution:\t ", self.err_best_g)
            i += 1

        return self.err_best_g, self.pos_best_g


if __name__ == "__main__":

    initial = []
    bounds = []

    args = read_args()

    if args.fn == 1:
        fn = fn1
    else:
        print("ERROR : FUNCTION NOT FOUND")

    box_limit = [-args.box, args.box]

    for i in range(args.d):
        initial.append(args.x0)
        bounds.append(box_limit)

    pso = Swarm(args, bounds)

    start = time.time()
Example #16
import sys
sys.path.append("..")

from utils import slab_print, read_args
import telugu as language
import scribe

args = read_args(sys.argv[1:], default='../configs/default.ast')
scriber = scribe.Scribe(language, **args['scribe_args'])

try:
    while True:
        image, text, labels = scriber.get_text_image()
        slab_print(image)
        print(image.shape)
        print(labels)
        # print("Twist: {:.3f}".format(angle), fp)
        # print(text)
        print(scriber)
        print("Press Enter to continue and Ctrl-D to quit.")
        input()
except (KeyboardInterrupt, EOFError):
    pass
Example #17
import logging
import os

import tensorflow as tf

import loader
from loader import augment_with_pretrained, calculate_global_maxes
from loader import update_tag_scheme, prepare_dataset
from loader import word_mapping, char_mapping, tag_mapping
# from model import Model
from model_tensorflow import Model
from utils import models_path, evaluate, eval_script, eval_temp
from utils import read_args, form_parameters_dict

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("main")

# Read parameters from command line
opts = read_args()

# Parse parameters
parameters = form_parameters_dict(opts)

# Check parameters validity
assert os.path.isfile(opts.train)
assert os.path.isfile(opts.dev)
assert os.path.isfile(opts.test)
assert parameters['char_dim'] > 0 or parameters['word_dim'] > 0
assert 0. <= parameters['dropout'] < 1.0
assert parameters['t_s'] in ['iob', 'iobes']
assert not parameters['all_emb'] or parameters['pre_emb']
assert not parameters['pre_emb'] or parameters['word_dim'] > 0
assert not parameters['pre_emb'] or os.path.isfile(parameters['pre_emb'])
Example #18
def show_results(res_dir: Path or str,
                 opts: dict = None,
                 curves: int = 0,
                 savefig=False):
    res_dir = Path(res_dir)
    args = u.read_args(res_dir / "args.txt")
    print(args.__dict__)

    inputs = np.load(os.path.join(args.imgdir, args.imgname),
                     allow_pickle=True)

    if opts is None:
        opts = dict()
    if 'clipval' not in opts.keys():
        opts['clipval'] = u.clim(inputs, 98)
    if 'save_opts' not in opts.keys():
        opts['save_opts'] = {
            'format': 'png',
            'dpi': 150,
            'bbox_inches': 'tight'
        }

    outputs, hist = reconstruct_patches(args,
                                        return_history=True,
                                        verbose=True)
    if outputs.shape != inputs.shape:
        print("\n\tWarning! Outputs and Inputs have different shape! %s - %s" %
              (outputs.shape, inputs.shape))
        inputs = inputs[:outputs.shape[0], :outputs.shape[1]]
        if inputs.ndim == 3:
            inputs = inputs[:, :, :outputs.shape[2]]

    # plot output volume
    if savefig:
        u.explode_volume(outputs, filename=res_dir / "output", **opts)
    else:
        u.explode_volume(outputs, **opts)

    # plot curves
    if curves > 0:
        if len(hist) <= curves:
            idx = range(len(hist))
        else:
            idx = sample(range(len(hist)), curves)
            idx.sort()

        fig, axs = plt.subplots(1, 4, figsize=(18, 4))

        for i in idx:
            axs[0].plot(hist[i].loss, label='patch %d' % i)
            axs[1].plot(hist[i].snr, label='patch %d' % i)
            axs[2].plot(hist[i].pcorr, label='patch %d' % i)
            try:
                axs[3].plot(hist[i].lr, label='patch %d' % i)
            except AttributeError:
                pass

        try:
            axs[0].set_title('LOSS %s' % args.loss)
        except AttributeError:
            axs[0].set_title('LOSS mae')
        axs[1].set_title('SNR = %.2f dB' % u.snr(outputs, inputs))
        axs[2].set_title('PCORR = %.2f %%' % (u.pcorr(outputs, inputs) * 100))
        axs[3].set_title('Learning Rate')

        for a in axs:
            a.legend()
            a.set_xlim(0, args.epochs)
            a.grid()

        axs[0].set_ylim(0)
        axs[1].set_ylim(0)
        axs[2].set_ylim(0, 1)
        axs[3].set_ylim(0, args.lr * 10)

        plt.suptitle(res_dir)
        plt.tight_layout(pad=.5)
        if savefig:
            plt.savefig(res_dir / f"curves.{opts['save_opts']['format']}",
                        **opts['save_opts'])
        plt.show()
Example #19
def main(argv=None):  # pylint: disable=unused-argument

  # if tf.gfile.Exists(FLAGS.eval_dir):
  #   tf.gfile.DeleteRecursively(FLAGS.eval_dir)
  # tf.gfile.MakeDirs(FLAGS.eval_dir)

  # Read parameters from command line
  opts = read_args(evaluation=True)

  # Parse parameters
  parameters = form_parameters_dict(opts)

  # Check parameters validity
  assert os.path.isfile(opts.train)
  assert os.path.isfile(opts.dev)
  assert os.path.isfile(opts.test)
  assert parameters['char_dim'] > 0 or parameters['word_dim'] > 0
  assert 0. <= parameters['dropout'] < 1.0
  assert parameters['t_s'] in ['iob', 'iobes']
  assert not parameters['all_emb'] or parameters['pre_emb']
  assert not parameters['pre_emb'] or parameters['word_dim'] > 0
  assert not parameters['pre_emb'] or os.path.isfile(parameters['pre_emb'])

  # Check evaluation script / folders
  if not os.path.isfile(eval_script):
      raise Exception('CoNLL evaluation script not found at "%s"' % eval_script)
  if not os.path.exists(eval_temp):
      os.makedirs(eval_temp)
  if not os.path.exists(models_path):
      os.makedirs(models_path)
  event_logs_path = os.path.join(eval_temp, "eval_logs")
  # if not os.path.exists(event_logs_path):
  #     os.makedirs(event_logs_path)

  # Initialize model
  model = MainTaggerModel(parameters=parameters, models_path=models_path,
                          overwrite_mappings=opts.overwrite_mappings)
  print "MainTaggerModel location: %s" % model.model_path

  # Data parameters
  lower = parameters['lower']
  zeros = parameters['zeros']
  tag_scheme = parameters['t_s']

  max_sentence_lengths = {}
  max_word_lengths = {}

  # Load sentences
  train_sentences, max_sentence_lengths['train'], max_word_lengths['train'] = \
      loader.load_sentences(opts.train, lower, zeros)
  dev_sentences, max_sentence_lengths['dev'], max_word_lengths['dev'] = loader.load_sentences(
      opts.dev, lower, zeros)
  test_sentences, max_sentence_lengths['test'], max_word_lengths['test'] = loader.load_sentences(
      opts.test, lower, zeros)

  global_max_sentence_length, global_max_char_length = \
      calculate_global_maxes(max_sentence_lengths, max_word_lengths)

  # Use selected tagging scheme (IOB / IOBES)
  update_tag_scheme(train_sentences, tag_scheme)
  update_tag_scheme(dev_sentences, tag_scheme)
  update_tag_scheme(test_sentences, tag_scheme)

  # Create a dictionary / mapping of words
  # If we use pretrained embeddings, we add them to the dictionary.
  if parameters['pre_emb']:
      dico_words_train = word_mapping(train_sentences, lower)[0]
      dico_words, word_to_id, id_to_word = augment_with_pretrained(
          dico_words_train.copy(),
          parameters['pre_emb'],
          list(itertools.chain.from_iterable(
              [[w[0] for w in s] for s in dev_sentences + test_sentences])
          ) if not parameters['all_emb'] else None
      )
  else:
      dico_words, word_to_id, id_to_word = word_mapping(train_sentences, lower)
      dico_words_train = dico_words

  # Create a dictionary and a mapping for words / POS tags / tags
  dico_chars, char_to_id, id_to_char = char_mapping(train_sentences)
  dico_tags, tag_to_id, id_to_tag = tag_mapping(train_sentences)

  if opts.overwrite_mappings:
      print 'Saving the mappings to disk...'
      model.save_mappings(id_to_word, id_to_char, id_to_tag)

  model.reload_mappings()

  # Index data
  train_buckets, train_stats, train_unique_words = prepare_dataset(
      train_sentences, word_to_id, char_to_id, tag_to_id,
      global_max_sentence_length, global_max_char_length,
      lower
  )
  dev_buckets, dev_stats, dev_unique_words = prepare_dataset(
      dev_sentences, word_to_id, char_to_id, tag_to_id,
      global_max_sentence_length, global_max_char_length,
      lower
  )
  test_buckets, test_stats, test_unique_words = prepare_dataset(
      test_sentences, word_to_id, char_to_id, tag_to_id,
      global_max_sentence_length, global_max_char_length,
      lower
  )

  print "%i / %i / %i sentences in train / dev / test." % (
      len(train_stats), len(dev_stats), len(test_stats))

  print "%i / %i / %i words in train / dev / test." % (
      sum([x[0] for x in train_stats]), sum([x[0] for x in dev_stats]),
      sum([x[0] for x in test_stats]))

  print "%i / %i / %i longest sentences in train / dev / test." % (
      max([x[0] for x in train_stats]), max([x[0] for x in dev_stats]),
      max([x[0] for x in test_stats]))

  print "%i / %i / %i shortest sentences in train / dev / test." % (
      min([x[0] for x in train_stats]), min([x[0] for x in dev_stats]),
      min([x[0] for x in test_stats]))

  for i, label in [[2, 'char']]:
      print "%i / %i / %i total %s in train / dev / test." % (
          sum([sum(x[i]) for x in train_stats]), sum([sum(x[i]) for x in dev_stats]),
          sum([sum(x[i]) for x in test_stats]),
          label)

      print "%i / %i / %i max. %s lengths in train / dev / test." % (
          max([max(x[i]) for x in train_stats]), max([max(x[i]) for x in dev_stats]),
          max([max(x[i]) for x in test_stats]),
          label)

      print "%i / %i / %i min. %s lengths in train / dev / test." % (
          min([min(x[i]) for x in train_stats]), min([min(x[i]) for x in dev_stats]),
          min([min(x[i]) for x in test_stats]),
          label)

  print "Max. sentence lengths: %s" % max_sentence_lengths
  print "Max. char lengths: %s" % max_word_lengths

  for label, bin_stats, n_unique_words in [['train', train_stats, train_unique_words],
                                           ['dev', dev_stats, dev_unique_words],
                                           ['test', test_stats, test_unique_words]]:
      int32_items = len(train_stats) * (
          max_sentence_lengths[label] * (5 + max_word_lengths[label]) + 1)
      float32_items = n_unique_words * parameters['word_dim']
      total_size = int32_items + float32_items
      logging.info("Input ids size of the %s dataset is %d" % (label, int32_items))
      logging.info("Word embeddings (unique: %d) size of the %s dataset is %d" % (
          n_unique_words, label, float32_items))
      logging.info("Total size of the %s dataset is %d" % (label, total_size))

  batch_size = 5

  # Build the model
  cost, train_step, tag_scores, tag_ids, word_ids, \
  crf_transition_params, sentence_lengths, enqueue_op, placeholders = model.build(
      max_sentence_length_scalar=global_max_sentence_length,
      max_word_length_scalar=global_max_char_length,
      batch_size_scalar=batch_size,
      **parameters)

  FLAGS = tf.app.flags.FLAGS

  tf.app.flags.DEFINE_string('eval_dir', event_logs_path,
                             """Directory where to write event logs.""")
  tf.app.flags.DEFINE_string('eval_data', 'test',
                             """Either 'test' or 'train_eval'.""")
  tf.app.flags.DEFINE_string('checkpoint_dir', model.model_path,
                             """Directory where to read model checkpoints.""")
  tf.app.flags.DEFINE_integer('eval_interval_secs', 60 * 5,
                              """How often to run the eval.""")
  tf.app.flags.DEFINE_integer('num_examples', 10000,
                              """Number of examples to run.""")
  tf.app.flags.DEFINE_boolean('run_once', False,
                              """Whether to run eval only once.""")

  evaluate(model,
           dev_buckets, test_buckets,
           FLAGS, opts,
           id_to_tag,
           batch_size,
           placeholders,
           enqueue_op, tag_scores, tag_ids, word_ids, crf_transition_params, sentence_lengths,
           FLAGS.eval_dir,
           tag_scheme)
Example #20

import itertools
from utils import read_args


def prob(a, b):
    # return a cross product of the two tuples
    children = list(itertools.product(a, b))
    return 1.0 * len(filter(has_allele, children)) / len(children)


def generate_prob_table():
    pairings = itertools.combinations_with_replacement([dg, hg, rg], 2)

    probability_table = dict((p, prob(*p)) for p in pairings)
    return probability_table

dg = 1, 1
hg = 1, 0
rg = 0, 0
prob_table = generate_prob_table()
# up to here we are just precomputing the probabilities.
# it's not even necessary honestly, a bit of premature optimization

d, h, r = read_args(3)
population = itertools.chain([dg] * d, [hg] * h, [rg] * r)

probs = map(lambda x: prob_table[x], itertools.combinations(population, 2))

# since we're already dividing by 4 in prob(), we aren't talking about
# integer numbers, so we take the mean
# generally we can't do this, but the values cancel out and it works out here
print('%.5f' % (sum(probs) / len(probs)))
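Here prob(a, b) enumerates the four equally likely allele combinations of a mating pair and returns the fraction of offspring carrying at least one dominant allele, and the final line averages that probability over every possible pairing in the population. The has_allele helper is not shown in the snippet; assuming it simply checks for a dominant allele (a 1 in the child tuple), a Python 3 sketch of the same computation is:

import itertools

def has_allele(child):
    # Assumed behaviour: the child carries at least one dominant allele.
    return 1 in child

def prob(a, b):
    children = list(itertools.product(a, b))
    return sum(1 for c in children if has_allele(c)) / len(children)

assert prob((1, 0), (1, 0)) == 0.75  # Aa x Aa -> 3/4 of offspring show the dominant trait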
Example #21
import sys
sys.path.append("..")

from utils import slab_print, read_args
import telugu as language
import scribe

args = read_args(sys.argv[1:],  default='../configs/default.ast')
scriber = scribe.Scribe(language, **args['scribe_args'])

try:
  while True:
    image, text, labels = scriber.get_text_image()
    slab_print(image)
    print(image.shape)
    print(labels)
    # print("Twist: {:.3f}".format(angle), fp)
    # print(text)
    print(scriber)
    print("Press Enter to continue and Ctrl-D to quit.")
    input()
except (KeyboardInterrupt, EOFError):
    pass
Example #22
import pickle
import sys
import numpy as np
import theano as th
from nnet.neuralnet import NeuralNet
import utils

# th.config.optimizer = 'fast_compile'
# th.config.exception_verbosity='high'

################################### Main Script ###########################
print('Loading the dataset.')
with open(sys.argv[1], 'rb') as pkl_file:
    data = pickle.load(pkl_file)

args = utils.read_args(sys.argv[2:])
num_epochs, train_on_fraction = args['num_epochs'], args['train_on_fraction']
scribe_args, nnet_args = args['scribe_args'], args['nnet_args']

chars = data['chars']
num_classes = len(chars)
img_ht = len(data['x'][0])
num_samples = len(data['x'])
nTrainSamples = int(num_samples * train_on_fraction)
printer = utils.Printer(chars)

print('\nInput Dim: {}'
      '\nNum Classes: {}'
      '\nNum Samples: {}'
      '\nNum Epochs: {}'
      '\nFloatX: {}'
Example #23
def train_a_single_configuration(
        lang_name, datasets_root, crf, lr_method, batch_size,
        sparse_updates_enabled, dropout, char_dim, char_lstm_dim,
        morpho_tag_dim, morpho_tag_lstm_dim, morpho_tag_type,
        morpho_tag_column_index, word_dim, word_lstm_dim, cap_dim,
        skip_testing, starting_epoch_no, maximum_epochs, file_format, debug,
        ner_train_file, ner_dev_file, ner_test_file, md_train_file,
        md_dev_file, md_test_file,
        use_golden_morpho_analysis_in_word_representation, embeddings_filepath,
        integration_mode, active_models, multilayer, shortcut_connections,
        reload, model_path, model_epoch_path, dynet_gpu, _run):
    """
    python train.py --pre_emb ../../data/we-300.txt --train dataset/gungor.ner.train.only_consistent --dev dataset/gungor.ner.dev.only_consistent --test dataset/gungor.ner.test.only_consistent --word_dim 300  --word_lstm_dim 200 --word_bidirect 1 --cap_dim 100 --crf 1 [email protected] --maximum-epochs 50 --char_dim 200 --char_lstm_dim 200 --char_bidirect 1 --overwrite-mappings 1 --batch-size 1 --morpho_tag_dim 100 --integration_mode 2
    """

    execution_part = "python main.py --command train --overwrite-mappings 1 "

    if sparse_updates_enabled == 0:
        execution_part += "--disable_sparse_updates "

    if dynet_gpu == 1:
        execution_part += "--dynet-gpu 1 "

    if use_golden_morpho_analysis_in_word_representation == 1:
        execution_part += "--use_golden_morpho_analysis_in_word_representation "

    execution_part += "--debug " + str(debug) + " "

    if word_dim == 0:
        embeddings_part = ""
    else:
        if embeddings_filepath:
            embeddings_part = "--pre_emb %s/%s " % (datasets_root,
                                                    embeddings_filepath)
        else:
            embeddings_part = ""

    always_constant_part = "--lang_name %s --file_format %s " \
                           "--ner_train_file %s/%s/%s " \
                           "%s" \
                           "--ner_test_file %s/%s/%s " \
                           "--md_train_file %s/%s/%s " \
                           "%s" \
                           "--md_test_file %s/%s/%s " \
                           "%s" \
                           "--skip-testing %d " \
                           "--tag_scheme iobes " \
                           "--starting-epoch-no %d " \
                           "--maximum-epochs %d " % (lang_name, file_format,
                                                     datasets_root, lang_name, ner_train_file,
                                                     ("--ner_dev_file %s/%s/%s " % (datasets_root, lang_name, ner_dev_file)) if ner_dev_file else "",
                                                     datasets_root, lang_name, ner_test_file,
                                                     datasets_root, lang_name, md_train_file,
                                                     ("--md_dev_file %s/%s/%s " % (datasets_root, lang_name,
                                                                                    md_dev_file)) if md_dev_file else "",
                                                     datasets_root, lang_name, md_test_file,
                                                     embeddings_part,
                                                     skip_testing, starting_epoch_no, maximum_epochs)

    if reload == 1:
        reload_part = "--reload %d --model_path %s --model_epoch_path %s " % (
            reload, model_path, model_epoch_path)
    else:
        reload_part = "--reload 0 "

    commandline_args = always_constant_part + \
              "--crf %d " \
              "--lr_method %s " \
              "--batch-size %d " \
              "--dropout %1.1lf " \
              "--char_dim %d " \
              "--char_lstm_dim %d " \
              "--morpho_tag_dim %d " \
              "--morpho_tag_lstm_dim %d " \
              "--morpho_tag_type %s " \
              "--morpho-tag-column-index %d " \
              "--word_dim %d " \
              "--word_lstm_dim %d "\
              "--cap_dim %d "\
              "--integration_mode %d " \
              "--active_models %d " \
              "--multilayer %d " \
              "--shortcut_connections %d " \
              "%s" % (crf,
                               lr_method,
                               batch_size,
                               dropout,
                               char_dim,
                               char_lstm_dim,
                               morpho_tag_dim,
                               morpho_tag_lstm_dim,
                               morpho_tag_type,
                               morpho_tag_column_index,
                               word_dim,
                               word_lstm_dim,
                               cap_dim,
                               integration_mode,
                               active_models,
                               multilayer,
                               shortcut_connections,
                               reload_part)

    # tagger_root = "/media/storage/genie/turkish-ner/code/tagger"

    print(_run)
    print(_run.info)

    print(subprocess.check_output(["id"]))
    print(subprocess.check_output(["pwd"]))

    opts = read_args(args_as_a_list=commandline_args.split(" "))
    print(opts)
    parameters = form_parameters_dict(opts)
    print(parameters)
    # model_path = get_name(parameters)
    model_path = get_model_subpath(parameters)
    print(model_path)

    task_names = ["NER", "MORPH"]

    for task_name in task_names:
        _run.info["%s_dev_f_score" % task_name] = dict()
        _run.info["%s_test_f_score" % task_name] = dict()

    _run.info["avg_loss"] = dict()

    _run.info['starting'] = 1

    dummy_prefix = ""

    full_commandline = dummy_prefix + execution_part + commandline_args

    print(full_commandline)
    process = subprocess.Popen(full_commandline.split(" "),
                               stdout=subprocess.PIPE,
                               stderr=subprocess.STDOUT)

    def record_metric(epoch, label, value):
        """
        Each label can have multiple values in an epoch. This is for updates to the metric's value.
        i.e. metrics calculated before an epoch has finished.
        :param epoch:
        :param label:
        :param value:
        :return:
        """
        epoch_str = str(epoch)
        if epoch_str in _run.info[label]:
            _run.info[label][epoch_str].append(value)
        else:
            _run.info[label][epoch_str] = [value]

    def capture_information(line):

        # 1
        """
        NER Epoch: %d Best dev and accompanying test score, best_dev, best_test: %lf %lf
        """
        for task_name in task_names:
            m = re.match(
                "^.*%s Epoch: (\d+) .* best_dev, best_test: (.+) (.+)$" %
                task_name, line)
            if m:
                epoch = int(m.group(1))
                best_dev = float(m.group(2))
                best_test = float(m.group(3))

                record_metric(epoch, "%s_dev_f_score" % task_name, best_dev)
                record_metric(epoch, "%s_test_f_score" % task_name, best_test)

        m = re.match("^.*Epoch (\d+) Avg. loss over training set: (.+)$", line)
        if m:
            epoch = int(m.group(1))
            avg_loss_over_training_set = float(m.group(2))
            record_metric(epoch, "avg_loss", avg_loss_over_training_set)
        """
        MainTaggerModel location: ./models/model-00000227
        """
        m = re.match("^.*MainTaggerModel location: (.+)$", line)
        if m:
            model_dir_path = m.group(1)
            _run.info["model_dir_path"] = model_dir_path
        """
        LOG: model_epoch_dir_path: {}
        """
        m = re.match("^.*LOG: model_epoch_dir_path: (.+)$", line)
        if m:
            model_epoch_dir_path = m.group(1)
            _run.info["model_epoch_dir_path"] = model_epoch_dir_path

    for line in process.stdout:
        sys.stdout.write(line.decode("utf8"))
        capture_information(line.decode("utf8"))
        sys.stdout.flush()

    return model_path
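Across the sacred-based examples (#10, #13 and #23), record_metric builds _run.info[label] as a mapping from stringified epoch numbers to lists of values, so a metric reported several times within one epoch keeps all of its intermediate values. A standalone illustration of that bookkeeping, with sacred's _run.info replaced by a plain dict and the scores invented for the example:

info = {"NER_dev_f_score": {}}

def record_metric(info, epoch, label, value):
    # Same structure as the record_metric helpers above: epoch keys are strings,
    # and each key accumulates every value reported during that epoch.
    info[label].setdefault(str(epoch), []).append(value)

record_metric(info, 1, "NER_dev_f_score", 71.2)
record_metric(info, 1, "NER_dev_f_score", 72.5)  # a second update within epoch 1
record_metric(info, 2, "NER_dev_f_score", 73.0)

assert info == {"NER_dev_f_score": {"1": [71.2, 72.5], "2": [73.0]}}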