Example #1
    def convert_dictionary(self, dictionary, word_rank):
        """Re-index a dictionary so each word is stored at the position given by word_rank."""
        rank_dictionary = data.Dictionary()
        rank_dictionary.idx2word = [''] * len(dictionary.idx2word)
        for idx, word in enumerate(dictionary.idx2word):
            rank = word_rank[idx]
            rank_dictionary.idx2word[rank] = word
            if word not in rank_dictionary.word2idx:
                rank_dictionary.word2idx[word] = rank
        return rank_dictionary
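Every snippet on this page relies on a data.Dictionary that exposes word2idx, idx2word, add_word, and __len__. The real data module is not shown here; a minimal sketch with that interface (an assumption for illustration only, not the actual module) would be:

# Minimal Dictionary sketch (assumed interface, not the actual data module):
# word2idx maps token -> index and idx2word is the inverse list.
class Dictionary:
    def __init__(self):
        self.word2idx = {}
        self.idx2word = []

    def add_word(self, word):
        if word not in self.word2idx:
            self.idx2word.append(word)
            self.word2idx[word] = len(self.idx2word) - 1
        return self.word2idx[word]

    def __len__(self):
        return len(self.idx2word)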
Example #2
def evaluate_zero_shot(args, model, tokenizer, path, src_query, trg_query):
    """
    Evaluate the model for a zero-shot classification task
    Return loss and accuracy
    """
    model.eval()
    pred_ls = []
    true_ls = []

    #### Data
    test_src = [line.rstrip('\n') for line in open(path + "/test.src")]
    test_trg = [line.rstrip('\n') for line in open(path + "/test.trg")]

    # Shuffle in case of short eval
    src_shuf = []
    trg_shuf = []
    index_shuf = list(range(len(test_src)))
    shuffle(index_shuf)
    for i in index_shuf:
        src_shuf.append(test_src[i])
        trg_shuf.append(test_trg[i])
    test_src = src_shuf
    test_trg = trg_shuf

    # Targets dictionary
    dictionary = data.Dictionary()
    for l in test_trg:
        dictionary.add_word(l)

    n_samples = len(test_src)
    if args.max_batches is not None and args.max_batches < n_samples:
        n_samples = args.max_batches

    # for i in trange(len(test_src)):
    for i in trange(n_samples):
        src, trg = test_src[i], test_trg[i]
        src += src_query
        # Get context hidden states once to speed up eval
        context = torch.tensor([tokenizer.encode(src)])
        pred, past = model(context)
        mp, true_lbl = most_probable_label(model, trg, trg_query, dictionary,
                                           past, tokenizer)
        pred_ls.append(mp)
        true_ls.append(true_lbl)

    return pred_ls, true_ls
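Because the function returns the two label lists rather than a loss, accuracy is computed by the caller. A hypothetical call site (argument names taken from the signature above) could look like:

# Hypothetical caller: derive zero-shot accuracy from the returned label lists.
pred_ls, true_ls = evaluate_zero_shot(args, model, tokenizer, path,
                                      src_query, trg_query)
accuracy = sum(p == t for p, t in zip(pred_ls, true_ls)) / len(true_ls)
print('zero-shot accuracy = {:.2%}'.format(accuracy))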
Example #3
###############################################################################
# Load data
###############################################################################

# load train and dev dataset
train_corpus = data.Corpus(args.tokenize, args.max_query_length,
                           args.max_doc_length)
train_corpus.parse(args.data + 'train.txt', max_example=args.max_example)
print('train set size = ', len(train_corpus))
dev_corpus = data.Corpus(args.tokenize, args.max_query_length,
                         args.max_doc_length)
dev_corpus.parse(args.data + 'dev.txt')
print('development set size = ', len(dev_corpus))

dictionary = data.Dictionary()
dictionary.build_dict(train_corpus, args.max_words)
# save the dictionary object to use during testing
helper.save_object(dictionary, args.save_path + 'dictionary.p')
print('vocabulary size = ', len(dictionary))

embeddings_index = helper.load_word_embeddings(args.word_vectors_directory,
                                               args.word_vectors_file,
                                               dictionary.word2idx)
print('number of OOV words = ', len(dictionary) - len(embeddings_index))

# ###############################################################################
# # Build the model
# ###############################################################################

model = NSRF(dictionary, embeddings_index, args)
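helper.save_object is only used here to persist the dictionary for the test phase. A plausible pickle-based stand-in (an assumption; the project's actual helper module may differ) is:

import pickle

def save_object(obj, filepath):
    # Plausible stand-in for helper.save_object: pickle the object to disk.
    with open(filepath, 'wb') as f:
        pickle.dump(obj, f)

def load_object(filepath):
    # Matching loader for reading the dictionary back at test time.
    with open(filepath, 'rb') as f:
        return pickle.load(f)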
Example #4
def main():
    ###############################################################################
    # Load data
    ###############################################################################

    dictionary = data.Dictionary()
    train_corpus = data.Corpus(dictionary)
    dev_corpus = data.Corpus(dictionary)
    test_corpus = data.Corpus(dictionary)

    task_names = ['snli', 'multinli'] if args.task == 'allnli' else [args.task]
    for task in task_names:
        skip_first_line = True if task == 'sick' else False
        train_corpus.parse(task,
                           args.data,
                           'train.txt',
                           args.tokenize,
                           num_examples=args.max_example,
                           skip_first_line=skip_first_line)
        if task == 'multinli':
            dev_corpus.parse(task, args.data, 'dev_matched.txt', args.tokenize)
            dev_corpus.parse(task, args.data, 'dev_mismatched.txt',
                             args.tokenize)
            test_corpus.parse(task,
                              args.data,
                              'test_matched.txt',
                              args.tokenize,
                              is_test_corpus=False)
            test_corpus.parse(task,
                              args.data,
                              'test_mismatched.txt',
                              args.tokenize,
                              is_test_corpus=False)
        else:
            dev_corpus.parse(task,
                             args.data,
                             'dev.txt',
                             args.tokenize,
                             skip_first_line=skip_first_line)
            test_corpus.parse(task,
                              args.data,
                              'test.txt',
                              args.tokenize,
                              is_test_corpus=False,
                              skip_first_line=skip_first_line)

    print('train set size = ', len(train_corpus.data))
    print('development set size = ', len(dev_corpus.data))
    print('test set size = ', len(test_corpus.data))
    print('vocabulary size = ', len(dictionary))

    # save the dictionary object to use during testing
    helper.save_object(dictionary,
                       args.save_path + args.task + '_dictionary.pkl')

    embeddings_index = helper.load_word_embeddings(args.word_vectors_directory,
                                                   args.word_vectors_file,
                                                   dictionary.word2idx)
    print('number of OOV words = ', len(dictionary) - len(embeddings_index))

    # ###############################################################################
    # # Build the model
    # ###############################################################################

    model = SentenceClassifier(dictionary, embeddings_index, args)
    optim_fn, optim_params = helper.get_optimizer(args.optimizer)
    optimizer = optim_fn(filter(lambda p: p.requires_grad, model.parameters()),
                         **optim_params)
    best_acc = 0

    if args.cuda:
        model = model.cuda()

    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            best_acc = checkpoint['best_acc']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    # ###############################################################################
    # # Train the model
    # ###############################################################################

    train = Train(model, optimizer, dictionary, embeddings_index, args,
                  best_acc)
    bestmodel = train.train_epochs(train_corpus, dev_corpus, args.start_epoch,
                                   args.epochs)
    test_batches = helper.batchify(test_corpus.data, args.batch_size)
    if 'multinli' in task_names:
        print(
            'Skipping evaluating best model. Evaluate using the test script.')
    else:
        test_accuracy, test_f1 = evaluate(bestmodel, test_batches, dictionary)
        print('accuracy: %.2f%%' % test_accuracy)
        print('f1: %.2f%%' % test_f1)
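The resume block above expects a checkpoint dictionary containing 'epoch', 'best_acc', 'state_dict', and 'optimizer'. A hypothetical writer producing those keys at the end of each epoch (the file name and the surrounding training loop are assumptions) would be:

# Hypothetical checkpoint writer matching the keys read in the resume block.
torch.save({
    'epoch': epoch + 1,  # epoch just completed (assumed loop variable)
    'best_acc': best_acc,
    'state_dict': model.state_dict(),
    'optimizer': optimizer.state_dict(),
}, args.save_path + args.task + '_checkpoint.pth')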
Example #5
	assert len(args.units_first) == len(args.colors_first), "!!!---Number of colors_first is not equal to number of units_first (1st Layer)---!!!"

gate_names = ['Input', 'Forget', 'Cell', 'Output']
# Parse output dir and file names:
# os.makedirs(os.path.dirname(args.output), exist_ok=True)
dirname = os.path.dirname(args.output)
filename = os.path.basename(args.output)

# Load model
print('Loading models...')
print('\nmodel: ' + args.model+'\n')
model = torch.load(args.model, lambda storage, loc: storage)
model.rnn.flatten_parameters()
embeddings_in = model.encoder.weight.data.cpu().numpy()
embeddings_out = model.decoder.weight.data.cpu().numpy()
vocab = data.Dictionary(args.vocabulary)

# Read list of contrasted words (e.g., singular vs. plural verbs).
with open(args.input, 'r') as f:
    lines = f.readlines()
verbs_singular = [l.split('\t')[0].strip() for l in lines]
verbs_plural = [l.split('\t')[1].strip() for l in lines]
verbs_all = verbs_singular + verbs_plural
print('\nWords used (group 1):')
print(verbs_singular)
print('\nWords used (group 2):')
print(verbs_plural)

# Get index in the vocab for all words and extract embeddings
idx_verbs_singular = [vocab.word2idx[w] for w in verbs_singular]
idx_verbs_plural = [vocab.word2idx[w] for w in verbs_plural]
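With the vocabulary indices in hand, the rows of the input-embedding matrix for each word group can be sliced with NumPy fancy indexing. A small illustrative continuation (not part of the original script) that compares the two group means:

import numpy as np

# Illustrative continuation: slice the input embeddings for each group and
# compare the mean vectors of the two contrasted word sets.
emb_singular = embeddings_in[idx_verbs_singular]   # shape (n_words, emb_dim)
emb_plural = embeddings_in[idx_verbs_plural]
diff = emb_singular.mean(axis=0) - emb_plural.mean(axis=0)
print('L2 distance between group means:', np.linalg.norm(diff))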
Example #6
    if not args.cuda:
        print(
            "WARNING: You have a CUDA device, so you should probably run with --cuda"
        )

device = torch.device("cuda" if args.cuda else "cpu")

###############################################################################
# Load data
###############################################################################
dic_exists = os.path.isfile(os.path.join(args.data, 'action_dictionary.pkl'))
if dic_exists:
    with open(os.path.join(args.data, 'action_dictionary.pkl'), 'rb') as f:
        Corpus_Dic = pickle.load(f)
else:
    Corpus_Dic = data.Dictionary()

train_data_name = os.path.join(
    args.data,
    str(args.number_per_class) + '_labeled_train.csv')
test_data_name = os.path.join(args.data, 'test.csv')

train_data = data.Csv_DataSet(train_data_name)
test_data = data.Csv_DataSet(test_data_name)
train_data.load(dictionary=Corpus_Dic)
test_data.load(dictionary=Corpus_Dic, train_mode=False)

# save the dictionary
if not dic_exists:
    with open(os.path.join(args.data, 'action_dictionary.pkl'),
              'wb') as output:
        # Write the shared dictionary so later runs can reload it
        # (the original snippet is truncated here; pickle.dump is assumed).
        pickle.dump(Corpus_Dic, output)
Example #7
    random.seed(args.seed)
    torch.manual_seed(args.seed)
    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu
    if torch.cuda.is_available():
        if not args.cuda:
            print(
                "WARNING: You have a CUDA device, so you should probably run with --cuda"
            )

    device = torch.device("cuda" if args.cuda else "cpu")

    ###############################################################################
    # Load vocab
    ###############################################################################

    vocab = data.Dictionary()
    with open("../data/train.txt", 'r', encoding="utf8") as f:
        for line in f:
            words = line.split() + ['<eos>']
            for word in words:
                vocab.add_word(word)

    ###############################################################################
    # Build the model
    ###############################################################################

    ntokens = len(vocab)
    forward_model = model.TransformerModel(ntokens, emsize, nhead, nhid,
                                           nlayers, dropout).to(device)
    backward_model = model.TransformerModel(ntokens, emsize, nhead, nhid,
                                            nlayers, dropout).to(device)
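The two TransformerModel instances consume token indices rather than strings. Assuming the Dictionary exposes word2idx as in the earlier examples, a line can be encoded like this (an illustrative sketch; the original batching code is not shown in this snippet):

# Illustrative sketch: encode the first training line with the vocabulary
# built above, so every token is guaranteed to be in-vocabulary.
with open("../data/train.txt", 'r', encoding="utf8") as f:
    line = f.readline()
ids = torch.tensor([vocab.word2idx[w] for w in line.split() + ['<eos>']],
                   dtype=torch.long, device=device)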
Example #8
def main():
    # Parse command line arguments
    parser = argparse.ArgumentParser(description='Translate using a pre-trained model')
    parser.add_argument('--model', help='a model previously trained with train.py')
    parser.add_argument('--batch_size', type=int, default=50, help='the batch size (defaults to 50)')
    parser.add_argument('--beam_size', type=int, default=12, help='the beam size (defaults to 12, 0 for greedy search)')
    parser.add_argument('--encoding', default='utf-8', help='the character encoding for input/output (defaults to utf-8)')
    parser.add_argument('-i', '--input', default=sys.stdin.fileno(), help='the input file (defaults to stdin)')
    parser.add_argument('-o', '--output', default=sys.stdout.fileno(), help='the output file (defaults to stdout)')
    parser.add_argument('--noise', type=float, default=0.5)
    parser.add_argument('--pass_att', action='store_true', default=False)
    parser.add_argument('--src_embeddings', default=None, help='common intersection source embeddings')
    parser.add_argument('--cutoff', type=int, default=None, help='cutoff for source embeddings above')
    parser.add_argument('--cat_embedds', help='use torch.load to load src and trg')
    parser.add_argument('--ncontrol', type=int, default=0, help='control number given while using the decoder')
    args = parser.parse_args()
    t = torch.load(args.model)
    # try:
    #     t = torch.load(args.model)
    # except Exception:
    #     # t = torch.load(args.model,map_location={'cuda:1':'cuda:0'})
    #     t = torch.load(args.model,map_location={'cuda:3'})

    # Translate sentences
    end = False
    fin = open(args.input, encoding=args.encoding, errors='surrogateescape')
    fout = open(args.output, mode='w', encoding=args.encoding, errors='surrogateescape')
    if args.src_embeddings is not None:
        encoder_embeddings, src_dictionary = data.read_embeddings(
            open(args.src_embeddings, 'r'), threshold=args.cutoff)
        encoder_embeddings = gpu(encoder_embeddings)
        t.decoder_embeddings = gpu(t.decoder_embeddings)
        t.generator = gpu(t.generator)
        t.encoder = gpu(t.encoder)
        t.decoder = gpu(t.decoder)

        translator_new = Translator(encoder_embeddings, t.decoder_embeddings,
                                    t.generator, src_dictionary,
                                    t.trg_dictionary, t.encoder, t.decoder,
                                    t.denoising, t.device)
    else:
        t.device = torch.device('cuda')
        t.encoder = gpu(t.encoder)
        t.decoder = gpu(t.decoder)
        t.encoder_embeddings = gpu(t.encoder_embeddings)
        t.decoder_embeddings = gpu(t.decoder_embeddings)
        t.generator = gpu(t.generator)
        t.src_dictionary = data.Dictionary(t.src_dictionary.id2word[1:])
        t.trg_dictionary = data.Dictionary(t.trg_dictionary.id2word[1:])
        translator_new = Translator(t.encoder_embeddings, t.decoder_embeddings,
                                    t.generator, t.src_dictionary,
                                    t.trg_dictionary, t.encoder, t.decoder,
                                    t.denoising, t.device)
    # print (translator_new.denoising)
    # exit(0)
    while not end:
        batch = []
        while len(batch) < args.batch_size and not end:
            line = fin.readline()
            if not line:
                end = True
            else:
                batch.append(line)
        if args.beam_size <= 0 and len(batch) > 0:
            for translation in translator_new.greedy(batch, train=False):
                print(translation, file=fout)
        elif len(batch) > 0:
            # Use the parsed --beam_size instead of a hard-coded width.
            translations = translator_new.beam_search(
                batch, train=False, beam_size=args.beam_size, max_ratio=2,
                rnk=6, noiseratio=args.noise, pass_att=args.pass_att,
                ncontrol=args.ncontrol if args.ncontrol != 0 else None)
            print(translations)
            if args.pass_att:
                for translation1,trans2 in translations:
                    print(translation1,trans2, file=fout)
            else:
                for translation in translations:
                    print(translation, file=fout)
        fout.flush()
    fin.close()
    fout.close()
Example #9
    # chapter = i
    # if chapter < 10:
    # 	# input_data = '/neurospin/unicog/protocols/IRMf/LePetitPrince_Pallier_2018/LSTM/Data/Chapters_Parsed/Test_all/Chapitre0{}.alt.txt'.format(chapter)
    # 	# output = '/neurospin/unicog/protocols/IRMf/LePetitPrince_Pallier_2018/LSTM/patterns/fr/Chap0{}_activations.pkl'.format(chapter)

    # 	input_data = '/neurospin/unicog/protocols/IRMf/LePetitPrince_Pallier_2018/LSTM/Data/en/Chapter0{}.txt'.format(chapter)
    # 	output = '/neurospin/unicog/protocols/IRMf/LePetitPrince_Pallier_2018/LSTM/patterns/en/updated/Chap0{}_activations.pkl'.format(chapter)

    # else:
    # 	# input_data = '/neurospin/unicog/protocols/IRMf/LePetitPrince_Pallier_2018/LSTM/Data/Chapters_Parsed/Test_all/Chapitre{}.alt.txt'.format(chapter)
    # 	# output = '/neurospin/unicog/protocols/IRMf/LePetitPrince_Pallier_2018/LSTM/patterns/fr/Chap{}_activations.pkl'.format(chapter)

    # 	input_data = '/neurospin/unicog/protocols/IRMf/LePetitPrince_Pallier_2018/LSTM/Data/en/Chapter{}.txt'.format(chapter)
    # 	output = '/neurospin/unicog/protocols/IRMf/LePetitPrince_Pallier_2018/LSTM/patterns/en/updated/Chap{}_activations.pkl'.format(chapter)

    vocab = data.Dictionary(vocabulary)
    sentences = []
    print(open(input_data, 'r'))
    for l in open(input_data, 'r'):
        if "'" in l:
            l = l.replace("'", "' ")

        sentence = l.rstrip().split(" ")
        sentence = [s.lower() for s in sentence]
        if l[0] != '\n':
            sentences.append(sentence)
    sentences = np.array(sentences)
    print(sentences)
    # sentences = sentences[0:1000]

    print('Loading models...')
Example #10
def main():
    # if output directory doesn't exist, create it
    if not os.path.exists(args.save_path):
        os.makedirs(args.save_path)

    # set the random seed manually for reproducibility.
    numpy.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if torch.cuda.is_available():
        if not args.cuda:
            print(
                "WARNING: You have a CUDA device, so you should probably run with --cuda"
            )
        else:
            torch.cuda.manual_seed(args.seed)

    print('\ncommand-line params : {0}\n'.format(sys.argv[1:]))
    print('{0}\n'.format(args))

    ###############################################################################
    # Load data
    ###############################################################################

    dictionary = data.Dictionary()
    tasks = []
    train_dict, dev_dict = {}, {}

    if 'quora' in args.task:
        print('**Task name : Quora**')
        # load quora dataset
        quora_train = data.Corpus(args.data, dictionary)
        quora_train.parse('quora/train.txt', 'quora', args.tokenize,
                          args.max_example)
        print('Found {} pairs of train sentences.'.format(len(
            quora_train.data)))

        quora_dev = data.Corpus(args.data, dictionary)
        quora_dev.parse('quora/dev.txt', 'quora', args.tokenize)
        print('Found {} pairs of dev sentences.'.format(len(quora_dev.data)))

        quora_test = data.Corpus(args.data, dictionary)
        quora_test.parse('quora/test.txt', 'quora', args.tokenize)
        print('Found {} pairs of test sentences.'.format(len(quora_test.data)))

        tasks.append(('quora', 2))
        train_dict['quora'] = quora_train
        dev_dict['quora'] = quora_dev

    if 'snli' in args.task:
        print('**Task name : SNLI**')
        # load snli dataset
        snli_train = data.Corpus(args.data, dictionary)
        snli_train.parse('snli/train.txt', 'snli', args.tokenize,
                         args.max_example)
        print('Found {} pairs of train sentences.'.format(len(
            snli_train.data)))

        snli_dev = data.Corpus(args.data, dictionary)
        snli_dev.parse('snli/dev.txt', 'snli', args.tokenize)
        print('Found {} pairs of dev sentences.'.format(len(snli_dev.data)))

        snli_test = data.Corpus(args.data, dictionary)
        snli_test.parse('snli/test.txt', 'snli', args.tokenize)
        print('Found {} pairs of test sentences.'.format(len(snli_test.data)))

        tasks.append(('snli', 3))
        train_dict['snli'] = snli_train
        dev_dict['snli'] = snli_dev

    if 'multinli' in args.task:
        print('**Task name : Multi-NLI**')
        # load multinli dataset
        multinli_train = data.Corpus(args.data, dictionary)
        multinli_train.parse('multinli/train.txt', 'multinli', args.tokenize,
                             args.max_example)
        print('Found {} pairs of train sentences.'.format(
            len(multinli_train.data)))

        multinli_dev = data.Corpus(args.data, dictionary)
        multinli_dev.parse('multinli/dev_matched.txt', 'multinli',
                           args.tokenize)
        multinli_dev.parse('multinli/dev_mismatched.txt', 'multinli',
                           args.tokenize)
        print('Found {} pairs of dev sentences.'.format(len(
            multinli_dev.data)))

        multinli_test = data.Corpus(args.data, dictionary)
        multinli_test.parse('multinli/test_matched.txt', 'multinli',
                            args.tokenize)
        multinli_test.parse('multinli/test_mismatched.txt', 'multinli',
                            args.tokenize)
        print('Found {} pairs of test sentences.'.format(
            len(multinli_test.data)))

        tasks.append(('multinli', 3))
        train_dict['multinli'] = multinli_train
        dev_dict['multinli'] = multinli_dev

    if 'allnli' in args.task:
        print('**Task name : AllNLI**')
        # load allnli dataset
        allnli_train = data.Corpus(args.data, dictionary)
        allnli_train.parse('snli/train.txt', 'snli', args.tokenize,
                           args.max_example)
        allnli_train.parse('multinli/train.txt', 'multinli', args.tokenize,
                           args.max_example)
        print('Found {} pairs of train sentences.'.format(
            len(allnli_train.data)))

        allnli_dev = data.Corpus(args.data, dictionary)
        allnli_dev.parse('snli/dev.txt', 'snli', args.tokenize)
        allnli_dev.parse('multinli/dev_matched.txt', 'multinli', args.tokenize)
        allnli_dev.parse('multinli/dev_mismatched.txt', 'multinli',
                         args.tokenize)
        print('Found {} pairs of dev sentences.'.format(len(allnli_dev.data)))

        allnli_test = data.Corpus(args.data, dictionary)
        allnli_test.parse('snli/test.txt', 'snli', args.tokenize)
        allnli_test.parse('multinli/test_matched.txt', 'multinli',
                          args.tokenize)
        allnli_test.parse('multinli/test_mismatched.txt', 'multinli',
                          args.tokenize)
        print('Found {} pairs of test sentences.'.format(len(
            allnli_test.data)))

        tasks.append(('allnli', 3))
        train_dict['allnli'] = allnli_train
        dev_dict['allnli'] = allnli_dev

    print('\nvocabulary size = ', len(dictionary))

    # save the dictionary object to use during testing
    helper.save_object(dictionary, args.save_path + 'dictionary.p')

    embeddings_index = helper.load_word_embeddings(args.word_vectors_directory,
                                                   args.word_vectors_file,
                                                   dictionary.word2idx)
    print('number of OOV words = ', len(dictionary) - len(embeddings_index))

    # ###############################################################################
    # # Build the model
    # ###############################################################################

    if not tasks:
        return

    model = MultitaskDomainAdapter(dictionary, embeddings_index, args, tasks)
    print(model)

    optim_fn, optim_params = helper.get_optimizer(args.optimizer)
    optimizer = optim_fn(filter(lambda p: p.requires_grad, model.parameters()),
                         **optim_params)
    best_accuracy = 0

    # for training on multiple GPUs. use CUDA_VISIBLE_DEVICES=0,1 to specify which GPUs to use
    if 'CUDA_VISIBLE_DEVICES' in os.environ:
        cuda_visible_devices = [
            int(x) for x in os.environ['CUDA_VISIBLE_DEVICES'].split(',')
        ]
        if len(cuda_visible_devices) > 1:
            model = torch.nn.DataParallel(model,
                                          device_ids=cuda_visible_devices)
    if args.cuda:
        model = model.cuda()

    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            best_accuracy = checkpoint['best_acc']
            model.load_state_dict(checkpoint['state_dict']['model'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    # ###############################################################################
    # # Train the model
    # ###############################################################################

    train = Train(model, optimizer, dictionary, embeddings_index, args,
                  best_accuracy)
    train.set_train_dev_corpus(train_dict, dev_dict)
    train.train_epochs(args.start_epoch, args.epochs)
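The OOV count printed in this example (len(dictionary) - len(embeddings_index)) implies that embeddings_index is keyed by word. If the missing words themselves are of interest, they could be listed right after that print with a short addition like this (illustrative, not part of the original script):

# Illustrative: list dictionary words that have no pre-trained vector,
# assuming embeddings_index maps word -> vector as the OOV count implies.
oov_words = [w for w in dictionary.word2idx if w not in embeddings_index]
print('sample OOV words:', oov_words[:10])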