def run_model(model_type,
              in_notebook=False,
              is_verbose=True,
              return_model=False):

    if model_type == 'LR':
        print(
            "\nPredicting Speaker Stance - Multi Label Logistic Regression Baseline Model "
        )
        model = baseline.Model()

    elif model_type == 'MultiClassLR':
        print(
            "\nPredicting Speaker Stance - Multi Class Logistic Regression Baseline Model "
        )
        model = multiclassbaseline.Model()

    elif model_type == 'MTNN':
        print("\nPredicting Speaker Stance - Multi Task Model ")
        model = MultiTaskNN.My_Model(is_verbose=is_verbose)

    elif model_type == 'MLP':
        print("\nPredicting Speaker Stance - Multi Layer Perceptron Model ")
        model = MultiLayerPercep.Model(is_verbose=is_verbose)

    elif model_type == 'FastText':
        print("\nPredicting Speaker Stance - FastText Model ")
        model = FastText.My_Model(is_verbose=is_verbose)

    elif model_type == 'MultiClassFastText':
        print("\nPredicting Speaker Stance - FastText Model ")
        model = FastTextMultiClass.My_Model(is_verbose=is_verbose)

    elif model_type == 'FastTextMT':
        print("\nPredicting Speaker Stance - FastText Multi Task Model ")
        model = FastTextMultiTask.My_Model(is_verbose=is_verbose)

    elif model_type == 'FastTextAux':
        print("\nPredicting Speaker Stance - FastText Aux Task Model ")
        model = FastTextAuxTask.My_Model(is_verbose=is_verbose)

    else:
        raise ValueError(
            'Unknown model type: {} is not a supported option'.format(model_type))

    print("Loading Data")

    if in_notebook:
        data = Dataset(in_notebook=True)
    else:
        data = Dataset()

    print("Training Model")
    model.train(data.train_set())

    y, y_pred = model.test(data.test_set())
    if return_model:
        return model, y, y_pred
    return y, y_pred
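
A minimal usage sketch (illustrative only; it assumes the baseline, MultiTaskNN, FastText and Dataset modules referenced above are importable):

if __name__ == '__main__':
    # Train the multi-task model and keep the fitted model around for inspection.
    mtnn_model, y_true, y_hat = run_model('MTNN', return_model=True)

    # Train the multi-label logistic regression baseline; only predictions are returned.
    y_true, y_hat = run_model('LR')
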
def run_model(test_language, evaluation_split, detailed_report):
    """ Trains the CWI model in all languages but one. Tests on all datasets of
        a particular language. Reports results.

    Args:
        test_language:      The language of the dataset to use for testing.
                            evaluation_split: The split of the data to use for
                            evaluating the performance of the model (dev or
                            test).

        detailed_report:    Whether to display a detailed report or just overall
                            score.

    """

    # collect the training data for all the languages but one
    train_data = []
    for language, datasets_names in datasets_per_language.items():
        if language != test_language:
            for dataset_name in datasets_names:
                data = Dataset(language, dataset_name)
                lang_train_set = data.train_set()
                if lang_train_set is None:
                    print("No training data found for language {}.".format(
                        language))
                else:
                    train_data.append(lang_train_set)

    train_data = pd.concat(train_data)

    # train the CWI model
    cwi_model = CrosslingualCWI(list(datasets_per_language.keys()))
    cwi_model.train(train_data)

    # test the model
    test_datasets = datasets_per_language[test_language]

    for dataset_name in test_datasets:
        data = Dataset(test_language, dataset_name)

        print("\nTesting on  {} - {}.".format(test_language, dataset_name))

        if evaluation_split in ["dev", "both"]:
            print("\nResults on Development Data")
            predictions_dev = cwi_model.predict(data.dev_set())
            gold_labels_dev = data.dev_set()['gold_label']
            print(
                report_binary_score(gold_labels_dev, predictions_dev,
                                    detailed_report))

        if evaluation_split in ["test", "both"]:
            print("\nResults on Test Data")
            predictions_test = cwi_model.predict(data.test_set())
            gold_labels_test = data.test_set()['gold_label']
            print(
                report_binary_score(gold_labels_test, predictions_test,
                                    detailed_report))

    print()
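
A hedged invocation sketch (assumes datasets_per_language, Dataset, CrosslingualCWI and report_binary_score are imported as in the snippet above):

if __name__ == '__main__':
    # Hold out Spanish for testing, train on every other language,
    # and report scores on both the dev and test splits.
    run_model(test_language='spanish',
              evaluation_split='both',
              detailed_report=True)
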
Example #3
def run_model(model_type,
              domain,
              is_verbose=True,
              probs=False,
              get_history=False):
    if model_type == 'logistic_regression':
        model = LR(domain, probs=probs)
    elif model_type == 'fast_text':
        model = FastText(domain, probs=probs, is_verbose=is_verbose)
    elif model_type == 'mlp':
        model = MLP(domain, is_verbose=is_verbose)
    else:
        raise ValueError('Unsupported model type: {}'.format(model_type))

    print("Loading Data")
    data = Dataset()

    print("Training Model")
    hist = model.train(data.train_set())

    if get_history:
        return hist

    y, y_pred = model.test(data.test_set())
    return y, y_pred
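
An illustrative call, assuming the LR, FastText and MLP classes above are available (the 'stance' domain name is hypothetical):

if __name__ == '__main__':
    # Train FastText on a domain and return the training history.
    history = run_model('fast_text', domain='stance', get_history=True)

    # Train the MLP and collect gold labels and predictions instead.
    y_true, y_hat = run_model('mlp', domain='stance')
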
def run_model(in_notebook=False):
    print("\nPredicting Speaker Stance - Baseline Model ")
    print("Loading Data")
    if in_notebook:
        data = Dataset(in_notebook=True)

    else:
        data = Dataset()

    model = Model()
    print("Training Model")
    model.train(data.train_set())

    print("\nResults on Test Data")
    y, y_pred = model.test(data.test_set())
    print(report_scores(y, y_pred))
Example #5
def run_model(language, dataset_name, evaluation_split, detailed_report):
    """Trains and tests the CWI model for a particular dataset of a particular language. Reports results.

    Args:
        language: The language of the dataset.
        dataset_name: The name of the dataset (all files should have it).
        evaluation_split: The split of the data to use for evaluating the performance of the model (dev, test or both).
        detailed_report: Whether to display a detailed report or just overall score.

    """
    print("\nModel for {} - {}.".format(language, dataset_name))

    data = Dataset(language, dataset_name)

    #The code below is used for creating unigram probability csv files

    # if (language == 'spanish'):
    #     corpus_words = nltk.corpus.cess_esp.words()
    #     unigram_counts = Counter(corpus_words)
    #     total_words = len(corpus_words)

    # def calc_unigram_prob(unigram_counts, total_words):
    #     u_prob = {} #defaultdict
    #     for word in unigram_counts:
    #         u_prob[word] = unigram_counts[word]/total_words
    #     return u_prob

    # def save_to_file(u_prob,file_name):
    #     w = csv.writer(open(file_name, "w"))
    #     for word, prob in u_prob.items():
    #         w.writerow([word, prob])
    # print('calc unigram prob: ')

    # u_prob = calc_unigram_prob(unigram_counts, total_words)
    # print('saving file')
    # save_to_file(u_prob, 'data/external/spanish_u_prob.csv')

    baseline = MonolingualCWI(language)

    baseline.train(data.train_set())

    if evaluation_split in ["dev", "both"]:
        print("\nResults on Development Data")
        predictions_dev = baseline.predict(data.dev_set())
        gold_labels_dev = data.dev_set()['gold_label']
        print(
            report_binary_score(gold_labels_dev, predictions_dev,
                                detailed_report))

    if evaluation_split in ["test", "both"]:
        print("\nResults on Test Data")
        predictions_test = baseline.predict(data.test_set())
        gold_labels_test = data.test_set()['gold_label']
        print(
            report_binary_score(gold_labels_test, predictions_test,
                                detailed_report))

    print()
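
A minimal usage sketch (MonolingualCWI, Dataset and report_binary_score as imported above; 'english'/'News' is one of the language/dataset pairs used elsewhere in these examples):

if __name__ == '__main__':
    # Train and evaluate the monolingual baseline on a single dataset.
    run_model(language='english',
              dataset_name='News',
              evaluation_split='test',
              detailed_report=False)
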
Example #6
def run_model():
    print("\nPredicting Speaker Stance - Baseline Model ")
    print("Loading Data")
    data = Dataset()
    model = Model()

    print("Training Model")
    model.train(data.train_set())

    print("\nResults on Test Data")
    y, y_pred = model.test(data.test_set())
    print(report_scores(y, y_pred))
    def __call__(self, parameters=None, verbose=True):
        Logger.log(verbose, "Loading datasets...")
        Logger.indent()
        p = Parameters(parameters, self.defaultParameters)
        dataset = {}
        for nameset in ["train", "test"]:
            data, labels = gen_reg_data(p.n, p.m)
            Logger.log(verbose, "synthetic " + nameset + " data generated")
            dataset[nameset] = Dataset(p, data, verbose=verbose)

        Logger.dindent()
        Logger.log(verbose, "datasets loaded!\n")
        return [dataset]
Example #8
    def evaluate(self, dataset, privater, epoch=-1):
        print(f'evaluating {type(self).__name__} on epoch {epoch}')
        train_data = dataset.get_train()
        test_data = dataset.get_test()
        train_data = privater.predict(train_data)
        test_data = privater.predict(test_data)
        dataset = Dataset(train_data=train_data, test_data=test_data)
        self.train_model = self.build_model(train_data)
        early_stopping = EarlyStopping()
        callbacks = [early_stopping]
        trainer = KerasTrainer(batch_size=self.batch_size,
                               epochs=self.epochs,
                               verbose=self.verbose)
        trainer.train(dataset, self, callbacks=callbacks)
        return min(early_stopping.best_val_acc, early_stopping.best_acc)
Example #9
def show_recognized(predictions):
    """
    Show images for which the system has produced a valid classification.
    :param predictions: list with predictions
    """
    d = Dataset(os.path.join(config.conf.AppPath, config.conf.DataPath))
    images = d.test.as_dataframe()['filename']
    labels = d.test.as_dataframe()['label']

    if len(images) != len(predictions):
        raise ValueError(
            'images must contain the same number of examples as predictions')

    to_show = images[predictions >= 0]
    for img in to_show:
        plt.imshow(mpimg.imread(img))
        plt.show()
Example #10
def show_test_errors(predictions):
    """
    Show images for which the prediction differs from the ground truth label.
    :param predictions: list with predictions
    """
    d = Dataset(os.path.join(config.conf.AppPath, config.conf.DataPath))
    images = d.test.as_dataframe()['filename']
    labels = d.test.as_dataframe()['label']

    if not (len(images) == len(predictions) == len(labels)):
        raise ValueError(
            'images, predictions and labels must contain the same number of examples'
        )

    to_show = images[(predictions >= 0) & (predictions != labels)]
    for img in to_show:
        plt.imshow(mpimg.imread(img))
        plt.show()
Example #11
    def __call__(self, parameters=None, verbose=True):
        Logger.log(verbose, "Loading datasets...")
        Logger.indent()
        p = Parameters(parameters, self.defaultParameters)
        dataset = {}
        for nameset in ["train", "test"]:
            data, names = load_data(nameset,
                                    k=p.k,
                                    mat=p.mat,
                                    small=p.small,
                                    nsmall=p.nsmall,
                                    givename=True)
            names = "(" + " and ".join(names) + ")"
            Logger.log(verbose, nameset + " data loaded! " + names)

            dataset[nameset] = Dataset(p, *data, verbose=verbose)

        Logger.dindent()
        Logger.log(verbose, "datasets loaded!\n")
        return [dataset]
def run_model(model_type, domain, trainset, testset, is_verbose, **kwargs):
    if model_type == 'fast_text':
        model = FastText(domain, is_verbose=is_verbose, **kwargs)

    elif model_type == 'logistic_regression':
        model = LR(domain)

    elif model_type == 'mlp':
        model = MLP(domain, is_verbose=is_verbose, **kwargs)
    else:
        raise ValueError('Unsupported model type: {}'.format(model_type))

    print("Loading Data")
    data = Dataset()

    print("Training Model")
    model.train(trainset)

    y, y_pred = model.test(testset)
    return y, y_pred
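
A sketch of how this variant might be driven, assuming Dataset exposes the same train_set()/test_set() splits used elsewhere in these examples (the 'stance' domain is hypothetical):

if __name__ == '__main__':
    data = Dataset()
    y_true, y_hat = run_model('fast_text',
                              domain='stance',
                              trainset=data.train_set(),
                              testset=data.test_set(),
                              is_verbose=True)
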
def run_experiment(config, run_training, run_testing):
    """Runs an experiment specified by a config file and cli args.

  Args:
    config: NamedTuple, a config.yaml file which has been parsed into an object.
    run_training: bool, whether to run the training pipeline.
    run_testing: bool, whether to run the testing pipeline.

  Returns:
    results: list(dict), a list of dictionaries, one per model specified in
      the `config`. Each dictionary contains performance metrics and
      hyperparameters which are specific to that model.
  """
    # Boilerplate: set seeds and create working dir.
    set_seed(config.seed)
    if not os.path.exists(config.working_dir):
        os.makedirs(config.working_dir)
    utils.write_config(config, os.path.join(config.working_dir, 'config.yaml'))

    print('MAIN: parsing dataset')
    start = time.time()
    dataset = Dataset(config, config.working_dir)
    print('MAIN: dataset done. took %.2fs' % (time.time() - start))

    # Train & test each of the models which are listed in the config.
    results = []
    for model_description in config.model_spec:
        if model_description.get('skip', False):
            continue
        if run_training:
            train_model(model_description, config, dataset)
        if run_testing:
            results += test_model(model_description, config, dataset)

    return results
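
Illustrative driver (how the config is parsed into an object is project-specific, so this is only sketched in comments; load_config is a hypothetical helper):

# config = load_config('config.yaml')   # parse YAML into a NamedTuple-like object
# results = run_experiment(config, run_training=True, run_testing=True)
# for result in results:
#     print(result)
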
def KernelTest(kernelname, parameters, synth=False):
    Dataset = findData("allseq")()[0]
    if synth:
        import numpy as np
        from src.data.dataset import Dataset

        defaultParameters = {
            "k": 0,
            "mat": False,
            "shuffle": False,
            "small": True,
            "nsmall": 200,
            "labels_change": True,
            "name": "seq",
            "nclasses": 2
        }

        from src.tools.utils import Parameters
        p = Parameters(None, defaultParameters)

        train = Dataset(p, np.array(['ATTA', 'AAAA']), np.array([0, 1]))
    else:
        train = Dataset["train"]

    Kernel = findKernel(kernelname)

    Logger.log(True, "Test the " + kernelname + " kernel.")
    Logger.indent()
    kernels = []
    for params in parameters:
        Logger.log(True, "Test with these parameters: " + str(params))
        Logger.indent()
        kernel = Kernel(train, params)
        kernels.append(kernel)
        Logger.log(True, kernel.K)
        Logger.dindent()

    Logger.dindent()
def run_model(language, dataset_name, evaluation_split, detailed_report, ablate):
    """Trains and tests the CWI model for a particular dataset of a particular language. Reports results.

    Args:
        language: The language of the dataset.
        dataset_name: The name of the dataset (all files should have it).
        evaluation_split: The split of the data to use for evaluating the performance of the model (dev or test).
        detailed_report: Whether to display a detailed report or just overall score.

    """
    score_only = bool(ablate)

    data = Dataset(language, dataset_name)
    #The code below is used for creating unigram probability csv files

    # corp = nltk.corpus.ConllCorpusReader('.', 'tiger_release_aug07.corrected.16012013.conll09',
    #                                      ['ignore', 'words', 'ignore', 'ignore', 'ignore'],
    #                                      encoding='utf-8')
    # filename = 'europarl-v7.fr-en.fr'
    # file = open(filename, mode='rt', encoding='utf-8')
    # corpus_words = []
    # for line in file:
    #     #print(line)
    #     corpus_words += line.strip(',').strip('.').split()
    #     #print(corpus_words)

    # #corpus_words = corp.words()
    # unigram_counts = Counter(corpus_words)
    # total_words = len(corpus_words)

    # def calc_unigram_prob(unigram_counts, total_words):
    #     u_prob = {} #defaultdict
    #     for word in unigram_counts:
    #         u_prob[word] = unigram_counts[word]/total_words
    #     return u_prob

    # def save_to_file(u_prob,file_name):
    #     w = csv.writer(open(file_name, "w"))
    #     for word, prob in u_prob.items():
    #         w.writerow([word, prob])
    # print('calc unigram prob: ')

    # u_prob = calc_unigram_prob(unigram_counts, total_words)
    # print('saving file')
    # save_to_file(u_prob, 'data/external/french_u_prob.csv')

    baseline = MonolingualCWI(language, ablate)

    baseline.train(data.train_set())


    if evaluation_split in ["dev", "both"]:
        if not score_only:
            print("\nResults on Development Data")
        predictions_dev = baseline.predict(data.dev_set())
        gold_labels_dev = data.dev_set()['gold_label']
        print(report_binary_score(gold_labels_dev, predictions_dev, detailed_report, score_only))


    if evaluation_split in ["test", "both"]:
        if not score_only:
            print("\nResults on Test Data")
        predictions_test = baseline.predict(data.test_set())
        gold_labels_test = data.test_set()['gold_label']
        print(report_binary_score(gold_labels_test, predictions_test, detailed_report, score_only))
    if not score_only:
        print()
Example #16
def run_experiment(config, args, expt_id):
    # if train, switch the dataset to train, then
    #  train and save each model in the config spec
    if not os.path.exists(config.working_dir):
        os.makedirs(config.working_dir)
    utils.write_config(config, os.path.join(config.working_dir, 'config.yaml'))

    print('MAIN: parsing dataset')
    start = time.time()
    d = Dataset(config, config.base_dir)
    print('MAIN: dataset done. took %.2fs' % (time.time() - start))

    if args.train:
        d.set_active_split(config.train_suffix)

        for model_description in config.model_spec:
            if model_description.get('skip', False):
                continue
            if args.model is not None and args.model != model_description[
                    'type']:
                continue

            print('MAIN: training', model_description['name'])
            start_time = time.time()
            model_dir = os.path.join(config.working_dir,
                                     model_description['name'])
            if not os.path.exists(model_dir):
                os.makedirs(model_dir)

            model = constants.MODEL_CLASSES[model_description['type']](
                config=config, params=model_description['params'])

            model.train(d, model_dir)
            model.save(model_dir)
            print('MAIN: training %s done, time %.2fs' % (
                model_description['name'], time.time() - start_time))

    # if test, switch the dataset to test,
    #  and run inference + evaluation for each model
    #  in the config spec
    if args.test:
        d.set_active_split(config.test_suffix)
        results = []  # items to be written in executive summary
        for model_description in config.model_spec:
            if model_description.get('skip', False):
                continue
            if args.model is not None and args.model != model_description[
                    'type']:
                continue

            print('MAIN: inference with', model_description['name'])
            start_time = time.time()

            model = constants.MODEL_CLASSES[model_description['type']](
                config=config, params=model_description['params'])

            model_dir = os.path.join(config.working_dir,
                                     model_description['name'])
            model.load(d, model_dir)

            predictions = model.inference(d, model_dir)
            utils.pickle(predictions, os.path.join(model_dir, 'predictions'))

            evaluation = evaluator.evaluate(config, d, predictions, model_dir)
            utils.pickle(evaluation, os.path.join(model_dir, 'evaluation'))
            evaluator.write_summary(evaluation, model_dir)
            # store info for executive summary
            results.append({
                'model-name': model_description['name'],
                'model-type': model_description['type'],
                'params': str(model_description['params']),
                'correlation': evaluation['mu_corr'],
                'regression_performance': evaluation['mu_reg_perf'],
                'fixed_performance': evaluation['mu_fixed_perf'],
                'model_dir': model_dir,
                'expt_id': expt_id
            })

            print('MAIN: evaluation %s done, time %.2fs' % (
                model_description['name'], time.time() - start_time))

        return results
Example #17
# coding: utf-8
from src.models.baseline import Model
from src.data.dataset import Dataset
data = Dataset().train_set()
y = Model().mlb.fit_transform(data)
        y = self.target_pipe.fit_transform(trainset)
        num_features = X.shape[1]
        self.model = self.build_net(num_features)
        self.model.fit(X, y, epochs=500, batch_size=32, verbose=self.is_verbose)

    def test(self, testset):
        X = self.feature_pipe.transform(testset)
        y = self.mlb.transform(testset)
        print(y.shape)
        y_pred_raw = self.model.predict(X)
        y_pred = np.column_stack(y_pred_raw)  # join the raw outputs into format for sklearn scoring
        print(y_pred.shape)
        threshold = 0.5
        # binarise the raw probabilities (values equal to the threshold count as 0)
        for row in y_pred:
            for i, val in enumerate(row):
                row[i] = 1 if val > threshold else 0
        print(y_pred[0])
        return y, y_pred


if __name__ == '__main__':

    from src.data.dataset import Dataset
    data = Dataset()
    model = Model()
    model.train(data.train_set())
    model.test(data.test_set())
Example #19
def dataset():
    ''' Return Dataset that has loaded the vocab file '''
    vocab = Vocab(VOCAB_PATH)
    return Dataset(vocab)
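
If dataset() is registered as a pytest fixture (the decorator is not shown in this truncated snippet), a test could consume it by parameter name, e.g.:

def test_dataset_loads_vocab(dataset):
    # A hypothetical smoke test: the fixture should return a usable Dataset.
    assert dataset is not None
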
        '20201005-183234-ig_resnext101_32x8d-320',
        '20201012-184118-ig_seresnext101_32x8d-224',
        '20201014-232454-ig_resnext101_32x16d-320'
    ]

    for checkpoint in checkpoint_list:
        name = '-'.join(checkpoint.split('-')[1:])
        model_list.append(name)

    for index, model_name in enumerate(model_list):
        img_size = int(model_name.split('-')[-1])
        if args.flag == "valid":
            save_weights = True
        else:
            save_weights = False
        dataset = Dataset(os.path.join(data_root, args.flag))

        checkpoint = glob.glob(
            os.path.join(output_root,
                         checkpoint_list[index] + '/*best*.pth.tar'))[0]
        model = create_model('%s' % model_name.split('-')[-2],
                             num_classes=args.num_classes,
                             in_chans=3,
                             checkpoint_path='%s' % checkpoint)
        model = model.cuda()
        model.eval()

        config = resolve_data_config(vars(args), model=model)
        loader = create_loader(dataset,
                               input_size=img_size,
                               batch_size=args.batch_size,
        '20200923-111323-gluon_seresnext101_32x4d-224',
        '20200928-091837-hrnet_w44-224'
    ]
    for checkpoint in checkpoint_list:
        name = '-'.join(checkpoint.split('-')[1:])
        model_list.append(name)

    for index, model_name in enumerate(checkpoint_list):
        img_size = int(model_name.split('-')[-1])
        if args.flag == "valid":
            data_path = os.path.join(
                "/home/data/classification/action/new_data/valid")
        else:
            data_path = "/home/data/classification/action/new_data/test/"
        if data_path.split('/')[-1] == 'valid':
            dataset = Dataset(data_path,
                              transform=tta_test_transform(img_size))
            save_weights = True
        else:
            dataset = TestDataset(data_path,
                                  transform=tta_test_transform(img_size))
            save_weights = False

        checkpoint = glob.glob(
            os.path.join(output_root,
                         checkpoint_list[index] + '/*best*.pth.tar'))[0]
        model = create_model('%s' % model_name.split('-')[-2],
                             num_classes=args.num_classes,
                             in_chans=3,
                             checkpoint_path='%s' % checkpoint)
        model = model.cuda()
        model.eval()
            counter.update(tokens)

        sort_by_counts = sorted(counter.items(), key=lambda x: x[1])
        words, counts = zip(*sort_by_counts)

        word2idx = dict(zip(words, range(1, len(words) + 1)))
        return word2idx, max_len


    def fit(self, X, *_):
        self.word2idx, self.sent_size = self.build_vocab(X)
        return self

    def transform(self, X, *_):
        vec = np.zeros((len(X), self.sent_size + 25))
        for i, sent in enumerate(X):
            tokens = self.TK.tokenize(sent, lowercase=True)
            for j, tok in enumerate(tokens):
                vec[i][j] = self.word2idx[tok]
        return vec


if __name__ == '__main__':
    from src.data.dataset import Dataset
    trainset = Dataset().train_set()
    WI = WordIndexer()
    test = WI.fit_transform(trainset['Utterance'])
    print(test[0])
    print(len(WI.word2idx))
    print(WI.sent_size)
def main(params):

    # initialize the experiment
    logger = initialize_exp(params)

    # generate parser / parse parameters
    parser = get_parser()
    params = parser.parse_args()
    reloaded = torch.load(params.model_path)
    model_params = AttrDict(reloaded['params'])
    logger.info("Supported languages: %s" % ", ".join(model_params.lang2id.keys()))

    # update dictionary parameters
    for name in ['n_words', 'bos_index', 'eos_index', 'pad_index', 'unk_index', 'mask_index']:
        setattr(params, name, getattr(model_params, name))

    # build dictionary / build encoder / build decoder / reload weights
    dico = Dictionary(reloaded['dico_id2word'], reloaded['dico_word2id'], reloaded['dico_counts'])
    encoder = TransformerModel(model_params, dico, is_encoder=True, with_output=True).cuda().eval()
    decoder = TransformerModel(model_params, dico, is_encoder=False, with_output=True).cuda().eval()
    encoder.load_state_dict(reloaded['encoder'])
    decoder.load_state_dict(reloaded['decoder'])
    params.src_id = model_params.lang2id[params.src_lang]
    params.tgt_id = model_params.lang2id[params.tgt_lang]

    # float16
    if params.fp16:
        assert torch.backends.cudnn.enabled
        encoder = network_to_half(encoder)
        decoder = network_to_half(decoder)

    input_data = torch.load(params.input)
    eval_dataset = Dataset(input_data["sentences"], input_data["positions"], params)

    if params.subset_start is not None:
        assert params.subset_end
        eval_dataset.select_data(params.subset_start, params.subset_end)

    eval_dataset.remove_empty_sentences()
    eval_dataset.remove_long_sentences(params.max_len)

    n_batch = 0

    inp_dump = io.open(os.path.join(params.dump_path, "input.txt"), "w", encoding="utf-8")
    logger.info("logging to {}".format(os.path.join(params.dump_path, 'input.txt')))

    with open(params.output_path, "w", encoding="utf-8") as out:

        for batch in eval_dataset.get_iterator(shuffle=False):
            n_batch += 1

            (x1, len1) = batch
            input_text = convert_to_text(x1, len1, input_data["dico"], params)
            inp_dump.write("\n".join(input_text))
            inp_dump.write("\n")

            langs1 = x1.clone().fill_(params.src_id)

            # cuda
            x1, len1, langs1 = to_cuda(x1, len1, langs1)

            # encode source sentence
            enc1 = encoder("fwd", x=x1, lengths=len1, langs=langs1, causal=False)
            enc1 = enc1.transpose(0, 1)

            # generate translation - translate / convert to text
            max_len = int(1.5 * len1.max().item() + 10)
            if params.beam_size == 1:
                generated, lengths = decoder.generate(enc1, len1, params.tgt_id, max_len=max_len)
            else:
                generated, lengths = decoder.generate_beam(
                    enc1, len1, params.tgt_id, beam_size=params.beam_size,
                    length_penalty=params.length_penalty,
                    early_stopping=params.early_stopping,
                    max_len=max_len)

            hypotheses_batch = convert_to_text(generated, lengths, input_data["dico"], params)

            out.write("\n".join(hypotheses_batch))
            out.write("\n")

            if n_batch % 100 == 0:
                logger.info("{} batches processed".format(n_batch))

    inp_dump.close()
Example #24
def main(unused_argv):
    if len(unused_argv) != 1:
        raise Exception('Problem with number of flags entered %s' %
                        unused_argv)

    tf.logging.set_verbosity(tf.logging.INFO)
    tf.logging.info('Starting RVAE model in %s mode', (FLAGS.mode))

    if FLAGS.mode == 'train':
        assert FLAGS.eval_path is not None, "Error! Eval path must be provided in 'train' mode. Use train_only for only training"
    elif FLAGS.mode == 'predict':
        FLAGS.batch_size = 1

    # change model_dir to model_dir/exp_name and create dir if needed
    FLAGS.model_dir = os.path.join(FLAGS.model_dir, FLAGS.exp_name)
    if not os.path.exists(FLAGS.model_dir):
        if FLAGS.mode == 'train' or FLAGS.mode == 'save_embed':
            os.makedirs(FLAGS.model_dir)
        else:
            raise Exception(
                "The model_dir specified does not exist. Run in train to create it"
            )
    elif not os.path.exists(FLAGS.vocab_path):
        raise Exception("Path specified for vocab file does not exist")
    elif FLAGS.checkpoint_path and not os.path.exists(FLAGS.checkpoint_path):
        raise Exception("Path for checkpoint does not exist")

    # load vocab and calculate size
    vocab = Vocab(vocab_path=FLAGS.vocab_path)
    vsize = len(vocab.vocab)

    # load the dataset
    ds = Dataset(vocab)

    # create an hps list
    if not FLAGS.app_config or not FLAGS.model_params:
        hp_list = [
            'batch_size', 'emb_dim', 'hidden_dim', 'latent_dim', 'dec_layers',
            'beam_size', 'max_dec_steps', 'lr', 'keep_prob', 'use_wdrop',
            'model_dir', 'vocab_path'
        ]
        hps_dict = {}
        for key in FLAGS:
            if key in hp_list:
                hps_dict[key] = FLAGS[key].value
        hps = namedtuple("HParams", hps_dict.keys())(**hps_dict)
    else:
        # TODO: Fix the yaml file location to use relative pathing so that we can find the config files
        flags = AppConfig('app.yaml', 'default')
        hps = ModelParams('hps.yaml', 'default')

    # call the model
    model = RVAE(hps, vsize)

    if FLAGS.mode == 'train':
        train_and_eval(model, ds, vocab)
    elif FLAGS.mode == 'predict':
        predictions = infer(model, ds, vocab, FLAGS.checkpoint_path)
        print(predictions)
    elif FLAGS.mode == 'eval':
        eval(model, ds, vocab)
    elif FLAGS.mode == 'save_embed':
        _ = vocab.read_embeddings(path=FLAGS.embed_path, load_np=False)
        print("Done saving numpy matrix")
        return
    elif FLAGS.mode == 'debug':
        print("debug")
    else:
        raise Exception("Invalid mode argument")

    return
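
A hypothetical command line for the script above (flag names are taken from the FLAGS references in the code; the script filename is illustrative):

# python rvae_main.py --mode=train --exp_name=baseline \
#     --model_dir=./models --vocab_path=./data/vocab.txt --eval_path=./data/dev.txt
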
Example #25
def run_model(selective_testing, translate, test_language, evaluation_split,
              detailed_report):
    """ Trains the CWI model in all languages but one. Tests on all datasets of
        a particular language. Reports results.

    Args:
        test_language:      The language of the dataset to use for testing.
                            evaluation_split: The split of the data to use for
                            evaluating the performance of the model (dev or
                            test).

        detailed_report:    Whether to display a detailed report or just overall
                            score.

    """

    # collect the training data for all the languages but one
    train_data = []

    if selective_testing == 'ESG':
        for language, datasets_names in datasets_per_language.items():
            if language != test_language:
                for dataset_name in datasets_names:
                    data = Dataset(language, dataset_name)
                    lang_train_set = data.train_set()
                    if lang_train_set is None:
                        print("No training data found for language {}.".format(
                            language))
                    else:
                        train_data.append(lang_train_set)
        train_data = pd.concat(train_data)
    else:
        train_data = pd.DataFrame()
        if 'E' in selective_testing:
            train_data = pd.concat([
                train_data,
                Dataset('english', 'News').train_set(),
                Dataset('english', 'WikiNews').train_set(),
                Dataset('english', 'Wikipedia').train_set()
            ])

        if 'S' in selective_testing:
            train_data = pd.concat(
                [train_data,
                 Dataset('spanish', 'Spanish').train_set()])

        if 'G' in selective_testing:
            train_data = pd.concat(
                [train_data,
                 Dataset('german', 'German').train_set()])


    # train the CWI model
    cwi_model = CrosslingualCWI(list(datasets_per_language.keys()))
    cwi_model.train(train_data)

    # test the model
    test_datasets = datasets_per_language[test_language]

    for dataset_name in test_datasets:
        data = Dataset(test_language, dataset_name)

        print("\nTesting on  {} - {}.".format(test_language, dataset_name))

        if evaluation_split in ["dev", "both"]:
            print("\nResults on Development Data")

            if test_language == 'french':
                print("\nNo Dev Data for French, skipping...")
            else:
                predictions_dev = cwi_model.predict(data.dev_set())
                gold_labels_dev = data.dev_set()['gold_label']
                print(
                    report_binary_score(gold_labels_dev, predictions_dev,
                                        detailed_report))

        if evaluation_split in ["test", "both"]:
            print("\nResults on Test Data")

            data.translate = translate
            predictions_test = cwi_model.predict(data.test_set())
            gold_labels_test = data.test_set()['gold_label']

            print(
                report_binary_score(gold_labels_test, predictions_test,
                                    detailed_report))

    print()
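
A hedged usage sketch for the selective variant (same imports as above; 'EG' selects the English and German training data only):

if __name__ == '__main__':
    # Train on English and German, enable test-time translation, and
    # report detailed test-set results for Spanish.
    run_model(selective_testing='EG',
              translate=True,
              test_language='spanish',
              evaluation_split='test',
              detailed_report=True)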