Example 1
def process(dataset, file_names=None):
    files_counter = {}
    input_path = f'data/raw/{dataset}/'
    output_path = f'data/clean/{dataset}/'
    for inp_file, out_file in list_files(input_path, output_path,
                                         file_names):
        # Open the raw file in binary mode when args.byte is set.
        mode = 'rb' if args.byte else 'r'
        with open(inp_file, mode) as ifile:
            # Look up the dataset-specific parser by name in this module.
            data = getattr(sys.modules[__name__], dataset)(ifile)
            data.to_file(out_file)
            files_counter[basename(out_file)] = data.qids
    # logging.info(files_counter)
    # Save the per-file qids bookkeeping for later use.
    with open(create_path(f'data/info/{dataset}/qids.pkl'), 'wb') as pfile:
        pickle.dump(files_counter, pfile)
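This function relies on names defined elsewhere in its module (list_files, create_path, a module-level args namespace, and one parser function per dataset). A minimal sketch of a command-line entry point that could drive it; the flag names below are assumptions inferred from the attributes the function reads, not the original script's interface:

if __name__ == '__main__':
    import argparse

    # Hypothetical CLI wrapper; --dataset and --byte are assumed flag
    # names, inferred from process(dataset) and args.byte above.
    parser = argparse.ArgumentParser()
    parser.add_argument('--dataset', required=True)
    parser.add_argument('--byte', action='store_true',
                        help='read the raw files in binary mode')
    args = parser.parse_args()
    process(args.dataset)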
Example 2
def trainer(name, config, dataset):
    # Use a timestamped results directory unless an explicit name is given.
    if name is None:
        experiment_path = train_utils.timestamp_dir("results")
    else:
        experiment_path = name

    with open(utils.create_path(f'{experiment_path}/config.json'),
              'w') as conff:
        json.dump(config, conff)
    logging.info(f'saving experiment in: {experiment_path}')
    train_utils.set_seed(config['seed'])
    logging.info('Loading embeddings...')

    vocab = Embeddings(f"data/embs/{dataset}/{config['embeddings']}.txt")
    logging.info('Initializing Net...')
    device = 'cuda'
    model = models.Model.by_name(config['model']['name'])(
        config['model']['params'], vocab, device).to(device)

    text_parser = models.Parser.by_name(config['parser'])(vocab)

    train_model = models.Trainer.by_name(config['train_as'])(text_parser,
                                                             model)

    train_data = QAdataset(f'data/parsed/{dataset}/train.json')
    valid_data = QAdataset(f'data/parsed/{dataset}/dev.json')
    test_data = QAdataset(f'data/parsed/{dataset}/test.json')

    optimizer = getattr(torch.optim, config['optimizer']['name'])(
        model.trainable_parameters(), **config['optimizer']['params'])
    train_model.fit(
        train_data,
        optimizer,
        validation=valid_data,
        save_point=f"{experiment_path}/{config['model']['name']}_test.pt",
        patience=config['patience'],
        batch_size=config['batch_size'],
        intervals=100)

    valid_pred = train_model.predict(valid_data)
    valid_pred.to_file(f"{experiment_path}/dev.json")
    test_pred = train_model.predict(test_data)
    test_pred.to_file(f"{experiment_path}/test.json")

    valid_metrics = reranking.evaluate(valid_pred, 0.5)
    test_metrics = reranking.evaluate(test_pred, 0.5)

    with open(f"{experiment_path}/valid_metrics_0.5.json", 'w') as f:
        json.dump(valid_metrics, f)
    with open(f"{experiment_path}/test_metrics_0.5.json", 'w') as f:
        json.dump(test_metrics, f)

    logging.info(
        f'Results on the validation set at threshold 0.5:\n{train_utils.print_metrics(valid_metrics)}'
    )
    logging.info(
        f'Results on the test set at threshold 0.5:\n{train_utils.print_metrics(test_metrics)}'
    )

    # Sweep thresholds on the validation set and keep the one with the
    # highest F1.
    max_f1 = 0
    best_th = 0
    for i in range(1, 100):
        th = i / 100
        f1 = reranking.f1(valid_pred, th)
        if f1 > max_f1:
            max_f1 = f1
            best_th = th

    valid_metrics = reranking.evaluate(valid_pred, best_th)
    test_metrics = reranking.evaluate(test_pred, best_th)

    with open(f"{experiment_path}/valid_metrics_best.json", 'w') as f:
        json.dump(valid_metrics, f)
    with open(f"{experiment_path}/test_metrics_best.json", 'w') as f:
        json.dump(test_metrics, f)

    logging.info(
        f'Results on the validation set at threshold {best_th}:\n{train_utils.print_metrics(valid_metrics)}'
    )
    logging.info(
        f'Results on the test set at threshold {best_th}:\n{train_utils.print_metrics(test_metrics)}'
    )
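For reference, trainer() reads the following configuration keys: seed, embeddings, model.name, model.params, parser, train_as, optimizer.name, optimizer.params, patience, and batch_size. A minimal sketch of such a config; the concrete values and component names are placeholders, not the project's actual settings:

# Hypothetical configuration covering the keys trainer() accesses;
# every value below is illustrative only.
config = {
    'seed': 42,
    'embeddings': 'glove.840B.300d',  # resolved as data/embs/<dataset>/<this>.txt
    'model': {'name': 'SomeModel', 'params': {'hidden_size': 128}},
    'parser': 'SomeParser',
    'train_as': 'SomeTrainer',
    'optimizer': {'name': 'Adam', 'params': {'lr': 1e-3}},
    'patience': 5,
    'batch_size': 32,
}
trainer(name=None, config=config, dataset='some_dataset')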
Example 3
def to_file(self, filename):
    # Write one JSON object per line (JSON Lines format).
    with open(create_path(filename), 'w') as out:
        out.writelines(self.iterator(lambda x: f"{x.to_json()}\n"))
Example 4
    parser.add_argument("--only_train",
                        dest='onlytrain',
                        help="lowercased",
                        action='store_true')
    parser.add_argument('--lower',
                        dest='lower',
                        help="lowercased",
                        action='store_true')
    parser.add_argument('--top_n',
                        dest='n',
                        help="the max number of words to keep",
                        type=int)
    args = parser.parse_args()

    def process(dataset, lower=True):
        # Load the parsed QA files (optionally only the training split)
        # and count word occurrences across all of them.
        datasets = []
        inp_path = f'data/parsed/{dataset}/'
        for inp_file, _ in list_files(inp_path, inp_path):
            if not args.onlytrain or inp_file.endswith('train.json'):
                datasets.append(QAdataset(inp_file))
        return Counter(
            chain(*(giff_words(dataset, lower) for dataset in datasets)))

    vocabulary = process(args.dataset, args.lower)
    with open(create_path(f'data/info/{args.dataset}/vocab.tsv'),
              'w') as ofile:
        top_n = vocabulary.most_common(args.n)
        logging.info(f'{args.dataset}: {len(top_n)} words')
        for word, freq in top_n:
            ofile.write(f'{word}\t{freq}\n')
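The resulting vocab.tsv contains one tab-separated word/frequency pair per line. A minimal sketch, assuming only that layout, of how it could be read back; load_vocab is a hypothetical helper, not part of the original code:

def load_vocab(path):
    # Read the word<TAB>frequency pairs written above into a dict.
    vocab = {}
    with open(path) as vfile:
        for line in vfile:
            word, freq = line.rstrip('\n').split('\t')
            vocab[word] = int(freq)
    return vocab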
Example 5
        logging.info(f'Stats Mean {mean} Std {std} Dim {dim}')
        logging.info(f'Original Vocab: {len(vocab)}')
        logging.info(f'Embedding Vocab: {len(w2v.vocab)}')
        filtered_w2v = dict(filter_embeddings(vocab, w2v))
        logging.info(f'Filtered Vocab: {len(filtered_w2v)}')

        # Add a PAD vector of zeros and an UNK vector sampled from a normal
        # distribution matching the embedding statistics.
        filtered_w2v['PAD'] = np.zeros((dim, ))
        filtered_w2v['UNK'] = np.random.normal(mean, std, (dim, ))
    else:
        vocab_set = set(word for word, _ in vocab)
        # Load only the vectors for words that appear in the vocabulary.
        w2v = load_w2v_fast(f'embs/{args.embeddings}', vocab_set)

        mean, std, dim = emb_stats(w2v)
        logging.info(f'Stats Mean {mean} Std {std} Dim {dim}')
        logging.info(f'Original Vocab: {len(vocab)}')
        logging.info(f'Embedding Vocab: {len(w2v)}')
        filtered_w2v = dict(filter_embeddings(vocab, w2v))
        logging.info(f'Filtered Vocab: {len(filtered_w2v)}')

        filtered_w2v['PAD'] = np.zeros((dim, ))
        filtered_w2v['UNK'] = np.random.normal(mean, std, (dim, ))

    logging.info('Saving file...')
    # Write the filtered embeddings in word2vec text format: a
    # "<vocab size> <dim>" header followed by one vector per line.
    # args.embeddings[:-4] strips the original file extension.
    with open(
            create_path(
                f'data/embs/{args.dataset}/{args.embeddings[:-4]}.txt'),
            'w') as ofile:
        ofile.write(f'{len(filtered_w2v)} {dim}\n')
        for word, emb in filtered_w2v.items():
            ofile.write(f"{word} {' '.join(str(val) for val in emb)}\n")