def __init__(self, gpu):
    # Fixed configuration: no lowercasing, no digit zeroing, no pad/unk items,
    # and no pretrained embeddings are loaded for this indexer.
    SeqIndexerBase.__init__(self, gpu=gpu, check_for_lowercase=False, zero_digits=False, pad=None, unk=None,
                            load_embeddings=False, verbose=True)
def __init__(self, gpu, check_for_lowercase, zero_digits, pad, unk, load_embeddings, embeddings_dim, verbose,
             isElmo=False, isBert=False):
    SeqIndexerBase.__init__(self, gpu, check_for_lowercase, zero_digits, pad, unk, load_embeddings,
                            embeddings_dim, verbose, isElmo, isBert)
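# For illustration, a minimal self-contained sketch of the word <-> index
# round-trip that these constructors configure. The class and method names
# below (ToySeqIndexer, add_item, items2idx, idx2items) are assumptions made
# for this sketch and are not necessarily the exact SeqIndexerBase API.
class ToySeqIndexer:
    def __init__(self, pad='<pad>', unk='<unk>'):
        self.item2idx_dict = {}
        self.idx2item_dict = {}
        self.unk = unk
        for special in (pad, unk):
            if special is not None:
                self.add_item(special)

    def add_item(self, item):
        idx = len(self.item2idx_dict)
        self.item2idx_dict[item] = idx
        self.idx2item_dict[idx] = item

    def items2idx(self, item_sequences):
        # Out-of-vocabulary items fall back to the <unk> index when one is configured
        unk_idx = self.item2idx_dict.get(self.unk)
        return [[self.item2idx_dict.get(item, unk_idx) for item in seq] for seq in item_sequences]

    def idx2items(self, idx_sequences):
        return [[self.idx2item_dict[idx] for idx in seq] for seq in idx_sequences]

# Usage: <pad>=0 and <unk>=1, so 'the'=2, 'cat'=3, 'sat'=4; unseen 'dog' maps to <unk>
toy = ToySeqIndexer()
for word in ['the', 'cat', 'sat']:
    toy.add_item(word)
assert toy.items2idx([['the', 'cat'], ['sat', 'dog']]) == [[2, 3], [4, 1]]
assert toy.idx2items([[2, 3]]) == [['the', 'cat']]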
from os.path import isfile

import torch

data_io = DataIOFactory.create(args)
word_sequences_train, tag_sequences_train, word_sequences_dev, tag_sequences_dev, word_sequences_test, \
    tag_sequences_test = data_io.read_train_dev_test(args)

# DatasetsBank stores the different dataset splits (train/dev/test) and samples batches from them
datasets_bank = DatasetsBankFactory.create(args)
datasets_bank.add_train_sequences(word_sequences_train, tag_sequences_train)
datasets_bank.add_dev_sequences(word_sequences_dev, tag_sequences_dev)
datasets_bank.add_test_sequences(word_sequences_test, tag_sequences_test)

# word_seq_indexer converts lists of lists of words to lists of lists of integer indices and back.
# If a serialized indexer already exists on disk, reuse it instead of rebuilding the vocabulary.
if args.word_seq_indexer is not None and isfile(args.word_seq_indexer):
    word_seq_indexer = torch.load(args.word_seq_indexer)
# With ELMo, a custom word_seq_indexer is not necessary. TODO: wire ELMo up to the seq_indexer class
elif args.isElmo:
    word_seq_indexer = SeqIndexerBase()
else:
    word_seq_indexer = SeqIndexerWord(gpu=args.gpu, check_for_lowercase=args.check_for_lowercase,
                                      embeddings_dim=args.emb_dim, verbose=True)
    # Build the vocabulary from the pretrained embeddings file, restricted to words seen in the data
    word_seq_indexer.load_items_from_embeddings_file_and_unique_words_list(
        emb_fn=args.emb_fn,
        emb_delimiter=args.emb_delimiter,
        emb_load_all=args.emb_load_all,
        unique_words_list=datasets_bank.unique_words_list)
# Cache the freshly built indexer so subsequent runs can skip the embeddings pass
if args.word_seq_indexer is not None and not isfile(args.word_seq_indexer):
    torch.save(word_seq_indexer, args.word_seq_indexer)

# tag_seq_indexer converts lists of lists of tags to lists of lists of integer indices and back
tag_seq_indexer = SeqIndexerTag(gpu=args.gpu)
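# Sketch of the caching pattern used above for word_seq_indexer: the first run
# builds the indexer and serializes it; later runs deserialize it instead of
# re-reading the embeddings file. torch.save/torch.load pickle whole Python
# objects, so the indexer round-trips intact. The path and builder function
# below are hypothetical stand-ins; also note that recent PyTorch (>= 2.6)
# defaults torch.load to weights_only=True, so unpickling a full object there
# requires passing weights_only=False explicitly.
from os.path import isfile
import torch

cache_path = 'word_seq_indexer.bin'  # hypothetical cache location
if isfile(cache_path):
    word_seq_indexer = torch.load(cache_path)  # fast path: reuse cached vocabulary
else:
    word_seq_indexer = build_word_seq_indexer(args)  # hypothetical slow-path builder
    torch.save(word_seq_indexer, cache_path)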