def load_dataset(path, train_limit=1000):
    """Load vocabulary, label index, data splits and word vectors from ``path``.

    Args:
        path: Directory containing ``vocab.txt``, the label file matching the
            dataset's measure type, ``train.pk``, ``test.pk`` and
            ``word2vec.vectors.npy``.
        train_limit: Maximum number of training examples to keep (applied as
            a slice). Defaults to 1000, the cap that used to be hard-coded;
            pass ``None`` to keep the whole training set.

    Returns:
        A tuple ``(helper, word_embeddings, train_data, test_data, tag_set)``.
    """
    charset = Charset()

    vocab = Vocabulary()
    vocab.load(f"{path}/vocab.txt")

    # The label file depends on the measure type, so determine the type first.
    # Loading tag2id.txt unconditionally (as was done before) was wasted work
    # and breaks for datasets that only ship entity_labels.txt.
    measure_type = get_measure_type(path)
    tag_set = Index()
    if measure_type == "relations":
        tag_set.load(f"{path}/tag2id.txt")
    elif measure_type == "entities":
        tag_set.load(f"{path}/entity_labels.txt")
    # Any other measure type leaves tag_set empty, as before.

    helper = Helper(vocab, tag_set, charset, measure_type=measure_type)

    # NOTE(review): `load` presumably unpickles these files -- only point this
    # function at trusted data directories.
    train_data = load(f"{path}/train.pk")[:train_limit]
    test_data = load(f"{path}/test.pk")
    word_embeddings = np.load(f"{path}/word2vec.vectors.npy")

    return helper, word_embeddings, train_data, test_data, tag_set
def __init__(self, path, subset = 'train.txt', index = None, seqlen = 35, skip = 35):
    """Set up a token sequence dataset backed by ``<path>/<subset>``.

    Args:
        path: Directory that contains the subset file.
        subset: File name of the subset inside ``path``.
        index: Index to use for this dataset; a fresh ``Index`` is created
            when ``None`` is given.
        seqlen: Forwarded to the parent constructor.
        skip: Forwarded to the parent constructor.
    """
    super(TokenSequence, self).__init__(seqlen, skip)

    self.path = path
    self.subset = subset
    self.file = os.path.join(path, subset)

    # Reuse the caller's index when one was supplied.
    if index is None:
        index = Index()
    self.index = index

    # load() is defined elsewhere in the class; its result becomes the data.
    self.data = self.load()
def load(self, skipheader = True, nlines = sys.maxsize, normalize = False):
    """Read word vectors from ``self.file`` into ``self.index`` and ``self.data``.

    Each line is split on ``self.separator``; the first field is the word and
    the following ``self.vdim`` fields are its vector components. A line is
    skipped when its word is already indexed or when it does not yield exactly
    ``self.vdim`` components. A line that raises while being parsed is
    reported on stderr and skipped.

    Args:
        skipheader: Discard the first line of the file before parsing.
        nlines: Stop after this many lines (counted after the header).
        normalize: Scale every accepted vector to unit L2 norm.

    Returns:
        ``self``, with ``self.data`` a float32 array of shape ``(n, self.vdim)``
        -- including ``(0, self.vdim)`` when no vector was accepted.
    """
    self.index = Index()
    print('Loading embedding from %s' % self.file)
    data_ = []
    with open(self.file, 'r', encoding='utf-8', errors='ignore') as f:
        if skipheader:
            f.readline()
        for i, line in enumerate(f):
            if i >= nlines:
                break
            try:
                splits = line.strip().split(self.separator)
                word = splits[0]
                if self.index.hasWord(word):
                    continue
                coefs = np.array(splits[1:self.vdim+1], dtype=np.float32)
                # Reject malformed rows before spending time normalising them.
                if coefs.shape != (self.vdim,):
                    continue
                if normalize:
                    length = np.linalg.norm(coefs)
                    if length == 0:
                        # Avoid dividing by zero for all-zero vectors.
                        length += 1e-6
                    coefs = coefs / length
                idx = self.index.add(word)
                data_.append(coefs)
                # NOTE(review): assumes Index.add returns the number of
                # entries after insertion -- confirm against Index.
                assert idx == len(data_)
            except Exception as err:
                # Deliberately best-effort: report the bad line and carry on.
                print('Error in line %d' % i, sys.exc_info()[0], file = sys.stderr)
                print(' ', err, file = sys.stderr)
                continue
    if data_:
        self.data = np.array(data_, dtype = np.float32)
    else:
        # np.array([]) would be 1-D with shape (0,); keep the 2-D layout.
        self.data = np.zeros((0, self.vdim), dtype = np.float32)
    return self
def __init__(self,
             path=None,
             lang='en',
             nlines=None,
             maxseqlen=None,
             index=None,
             nbos=0,
             neos=1,
             posiindex=None,
             classindex=None,
             bert_model='bert-base-uncased',
             maxseqlen_bert=None,
             cache_device_tensors=True):
    """Configure the dataset: indices, tokenizers and an empty sample table.

    Args:
        path: Stored as ``self.path``.
        lang: Language code handed to ``importSpacy``.
        nlines: Stored as ``self.nlines``.
        maxseqlen: Stored as ``self.maxseqlen``.
        index: Token index; a new ``Index`` is created when ``None``. The
            entries ``<pad>``, ``<s>``, ``</s>`` and ``<unk>`` are added to it.
        nbos: Number of begin-of-sequence markers, clamped to at least 0.
        neos: Number of end-of-sequence markers, clamped to at least 1.
        posiindex: Position index; a new ``Index`` is created when ``None``.
        classindex: Class index; a new ``Index`` is created when ``None``.
            Its ``unkindex`` is set to 0 in either case.
        bert_model: Either a model name (a ``str``), from which a
            ``BertTokenizer`` is loaded, or an object used as the tokenizer
            as-is.
        maxseqlen_bert: BERT sequence limit; falls back to the tokenizer's
            ``max_len`` when falsy.
        cache_device_tensors: When true, ``self.tensor_cache`` starts as an
            empty list, otherwise it is ``None``.
    """
    super(LttcDataset, self).__init__()

    self.path = path
    self.maxseqlen = maxseqlen
    self.nbos = max(0, nbos)
    self.neos = max(1, neos)

    # Token index: the special symbols are registered in this fixed order.
    if index is None:
        index = Index()
    self.index = index
    self.padidx = index.add('<pad>')
    self.bosidx = index.add('<s>')
    self.eosidx = index.add('</s>')
    index.unkindex = index.add('<unk>')

    if classindex is None:
        classindex = Index()
    self.classindex = classindex
    self.classindex.unkindex = 0

    if posiindex is None:
        posiindex = Index()
    self.posiindex = posiindex

    self.nlines = nlines
    self.device = torch.device('cpu')
    self.lang = lang
    self.spacy_model = importSpacy(self.lang)

    # A string names a pretrained model; anything else is taken to be a
    # ready-made tokenizer.
    if isinstance(bert_model, str):
        lower_case = 'uncased' in bert_model
        self.bert_tokenizer = BertTokenizer.from_pretrained(
            bert_model, do_lower_case=lower_case)
    else:
        self.bert_tokenizer = bert_model
    self.maxseqlen_bert = maxseqlen_bert or self.bert_tokenizer.max_len

    sample_columns = [
        'id', 'filename', 'rawdata', 'spacydata', 'spacy_to_bert_position',
        'seq', 'seq_bert', 'seqlen', 'seqlen_bert', 'seq_recon', 'pseq',
        'pseq_rev', 'label', 'labelid'
    ]
    self.samples = pandas.DataFrame(columns=sample_columns)

    if cache_device_tensors:
        self.tensor_cache = []
    else:
        self.tensor_cache = None
def loadData(args):
    """Build datasets, data loaders and optional initial embedding weights.

    Reads ``chars``, ``data``, ``bptt``, ``device``, ``init_weights``,
    ``emsize``, ``shuffle_samples``, ``sequential_sampling``,
    ``shuffle_batches``, ``batch_size`` and ``seed`` from ``args``. The
    vocabulary collected from the training subset is frozen and written to
    ``vocab_chars.txt`` or ``vocab_tokens.txt`` inside ``args.data`` before
    the test and validation subsets are read.

    The results are attached to ``args`` as ``index``, ``ntokens``,
    ``trainloader``, ``testloader``, ``validloader``, ``preembweights`` and
    ``eval_batch_size``.

    Returns:
        The same ``args`` object, extended with the attributes listed above.

    Raises:
        ValueError: If the embedding type cannot be derived from the suffix of
            ``args.init_weights``, or a ``bin`` embedding does not have
            ``args.emsize`` dimensions.
    """
    dataset_cls = data.CharSequence if args.chars else data.TokenSequence
    print(dataset_cls.__name__)

    index = Index(initwords = ['<unk>'], unkindex = 0)

    def build_split(subset):
        # All three subsets share the index and the sequence geometry.
        dataset = dataset_cls(args.data, subset = subset, index = index, seqlen = args.bptt, skip = args.bptt)
        return dataset.to(args.device)

    train_ = build_split('train.txt')
    # Freeze and persist the vocabulary once the training subset has been read.
    vocab_file = 'vocab_chars.txt' if args.chars else 'vocab_tokens.txt'
    index.freeze(silent = True).tofile(os.path.join(args.data, vocab_file))
    test_ = build_split('test.txt')
    valid_ = build_split('valid.txt')

    # Optional pre-trained embedding; its kind is derived from the suffix.
    preemb_weights = None
    if args.init_weights:
        weights_source = args.init_weights
        if weights_source.endswith('bin'):
            preemb = FastTextEmbedding(weights_source, normalize = True).load()
            if args.emsize != preemb.dim():
                raise ValueError('emsize must match embedding size. Expected %d but got %d)' % (args.emsize, preemb.dim()))
        elif weights_source.endswith('txt'):
            preemb = TextEmbedding(weights_source, vectordim = args.emsize).load(normalize = True)
        elif weights_source.endswith('rand'):
            preemb = RandomEmbedding(vectordim = args.emsize)
        else:
            raise ValueError('Type of embedding cannot be inferred.')
        preemb = Embedding.filteredEmbedding(index.vocabulary(), preemb, fillmissing = True)
        preemb_weights = torch.Tensor(preemb.weights)

    eval_batch_size = 10
    item_sampler_cls = RandomSampler if args.shuffle_samples else SequentialSampler
    batch_sampler_cls = BatchSampler if args.sequential_sampling else EvenlyDistributingSampler

    train_batches = ShufflingBatchSampler(
        batch_sampler_cls(item_sampler_cls(train_), batch_size = args.batch_size, drop_last = True),
        shuffle = args.shuffle_batches,
        seed = args.seed)
    train_loader = torch.utils.data.DataLoader(train_, batch_sampler = train_batches, num_workers = 0)

    def build_eval_loader(dataset):
        # Evaluation loaders use the fixed batch size and no batch shuffling.
        batches = batch_sampler_cls(item_sampler_cls(dataset), batch_size = eval_batch_size, drop_last = True)
        return torch.utils.data.DataLoader(dataset, batch_sampler = batches, num_workers = 0)

    test_loader = build_eval_loader(test_)
    valid_loader = build_eval_loader(valid_)

    print(item_sampler_cls.__name__)
    print(batch_sampler_cls.__name__)
    print('Shuffle training batches: ', args.shuffle_batches)

    # Hand everything back through the argument namespace.
    args.index = index
    args.ntokens = len(index)
    args.trainloader = train_loader
    args.testloader = test_loader
    args.validloader = valid_loader
    args.preembweights = preemb_weights
    args.eval_batch_size = eval_batch_size

    return args
def filteredEmbedding(vocabulary, embedding, fillmissing = True):
    """Build an ``Embedding`` restricted to the words of ``vocabulary``.

    Words are taken in iteration order and duplicates are ignored. A word
    known to ``embedding`` gets its vector from there. An unknown word gets a
    vector from a ``RandomEmbedding`` of the same dimension when
    ``fillmissing`` is true, and is left out otherwise.

    Args:
        vocabulary: Iterable of words to keep.
        embedding: Source embedding; must provide ``dim()``,
            ``containsWord()`` and ``getVector()``.
        fillmissing: Whether words missing from ``embedding`` are filled in
            rather than dropped.

    Returns:
        A new ``Embedding`` built from the selected float32 weights and their
        index. When no word is selected the weight matrix has shape
        ``(0, embedding.dim())`` instead of a 1-D empty array.
    """
    index = Index()
    weights = []
    # The random fallback is only needed (and only created) when filling gaps.
    rv = RandomEmbedding(embedding.dim()) if fillmissing else None
    for w in vocabulary:
        if index.hasWord(w):
            continue
        if embedding.containsWord(w):
            index.add(w)
            weights.append(embedding.getVector(w))
        elif fillmissing:
            index.add(w)
            weights.append(rv.getVector(w))
    if weights:
        weights = np.array(weights, dtype = np.float32)
    else:
        # np.array([]) would be 1-D with shape (0,); keep the 2-D layout so
        # consumers still see the embedding dimension.
        weights = np.zeros((0, embedding.dim()), dtype = np.float32)
    return Embedding(weights, index)
def load(self, skipheader = True, nlines = sys.maxsize, normalize = False):
    """Read word vectors from ``self.file`` and build a faiss L2 index over them.

    Each line is split on ``self.separator``; the first field is the word and
    the following ``self.vdim`` fields are its vector components. A line is
    skipped when its word is already indexed or when it does not yield exactly
    ``self.vdim`` components. A line that raises while being parsed is
    reported on stderr and skipped.

    Args:
        skipheader: Discard the first line of the file before parsing.
        nlines: Stop after this many lines (counted after the header).
        normalize: Scale every accepted vector to unit L2 norm. A warning is
            printed when this is false.

    Returns:
        ``self``, with ``self.index``, ``self.data`` (float32, shape
        ``(n, self.vdim)``) and ``self.invindex`` populated.
    """
    self.index = Index()
    print('Loading embedding from %s' % self.file)
    data_ = []
    with open(self.file, 'r', encoding='utf-8', errors='ignore') as f:
        if skipheader:
            f.readline()
        for i, line in enumerate(f):
            if i >= nlines:
                break
            try:
                splits = line.strip().split(self.separator)
                word = splits[0]
                if self.index.hasWord(word):
                    continue
                coefs = np.array(splits[1:self.vdim+1], dtype=np.float32)
                # Reject malformed rows before spending time normalising them.
                if coefs.shape != (self.vdim,):
                    continue
                if normalize:
                    length = np.linalg.norm(coefs)
                    if length == 0:
                        # Avoid dividing by zero for all-zero vectors.
                        length += 1e-6
                    coefs = coefs / length
                idx = self.index.add(word)
                data_.append(coefs)
                # NOTE(review): assumes Index.add returns the number of
                # entries after insertion -- confirm against Index.
                assert idx == len(data_)
            except Exception as err:
                # Deliberately best-effort: report the bad line and carry on.
                print('Error in line %d' % i, sys.exc_info()[0], file = sys.stderr)
                print(' ', err, file = sys.stderr)
                continue
    if data_:
        self.data = np.array(data_, dtype = np.float32)
    else:
        # np.array([]) would be 1-D with shape (0,); the search index needs a
        # 2-D (n, vdim) matrix even when n is 0.
        self.data = np.zeros((0, self.vdim), dtype = np.float32)
    print('Building faiss index...')
    # The `normalize` argument is what decides whether this call normalised
    # the vectors, so the warning is keyed on it rather than on an instance
    # attribute of the same name.
    if not normalize:
        print('Attention, normlization of vectors is required to guarantee functional search behaviour. Be sure your vectors are normalized, otherwise declare normlaize flag!')
    self.invindex = faiss.IndexFlatL2(self.vdim)
    self.invindex.add(self.data)
    print('Faiss index built:', self.invindex.is_trained)
    return self
def __init__(self, path, subset = 'train.txt', nlines=None, maxseqlen=None, maxentlen=None, maxdist=60, nbos = 0, neos = 1, index = None, posiindex = None, classindex = None, rclassindex = None, dclassindex = None, eclassindex = None, compact=True):
    """Configure the dataset, set up its indices and load the data.

    Args:
        path: Stored as ``self.path``.
        subset: Stored as ``self.subset``.
        nlines: Passed to ``self.load``.
        maxseqlen: Stored as ``self.maxseqlen``.
        maxentlen: Stored as ``self.maxentlen``.
        maxdist: Stored as ``self.maxdist``; also seeds the default position
            index with ``maxdist`` and ``-maxdist``.
        nbos: Number of begin-of-sequence markers, clamped to at least 0.
        neos: Number of end-of-sequence markers, clamped to at least 1.
        index: Token index; a new ``Index`` is created when ``None``. The
            entries ``<s>``, ``</s>``, ``<pad>`` and ``<epad>`` are added.
        posiindex: Position index; when ``None`` a new ``Index`` seeded with
            ``[maxdist, -maxdist]`` and ``unkindex = 0`` is created.
        classindex, rclassindex, dclassindex, eclassindex: Label indices; a
            new ``Index`` is created for each one that is ``None``.
        compact: Passed to ``self.load``.
    """
    self.path = path
    self.subset = subset
    self.maxseqlen = maxseqlen
    self.maxdist = maxdist
    self.nbos = max(0, nbos)
    self.neos = max(1, neos)

    self.index = index if index is not None else Index()
    self.bosidx = self.index.add('<s>')
    self.eosidx = self.index.add('</s>')
    self.padidx = self.index.add('<pad>')
    self.epadidx = self.index.add('<epad>')

    self.classindex = classindex if classindex is not None else Index()
    self.rclassindex = rclassindex if rclassindex is not None else Index()
    self.dclassindex = dclassindex if dclassindex is not None else Index()
    self.eclassindex = eclassindex if eclassindex is not None else Index()
    # Decide on `posiindex` itself. Testing `eclassindex` here (as before)
    # dropped a supplied position index whenever `eclassindex` was omitted and
    # stored None whenever only `eclassindex` was supplied.
    self.posiindex = posiindex if posiindex is not None else Index(initwords = [ maxdist, -maxdist ], unkindex = 0)

    self.maxentlen = maxentlen
    # Loading happens before the device attributes are set, as in the
    # original statement order.
    self.load(nlines, compact)

    self.device = torch.device('cpu')
    # Empty tensor on the target device, which can be used for copying.
    self.deviceTensor = torch.LongTensor().to(self.device)
# Set the random seed manually for reproducibility. torch.manual_seed(args.seed) if torch.cuda.is_available(): if not args.cuda: print( "WARNING: You have a CUDA device, so you should probably run with --cuda" ) device = torch.device("cuda" if args.cuda else "cpu") ############################################################################### # Load data ############################################################################### __SequenceDataset = CharSequence if args.chars else TokenSequence print(__SequenceDataset.__name__) index = Index(initwords=['<unk>'], unkindex=0) train_ = __SequenceDataset(args.data, subset='train.txt', index=index, seqlen=args.bptt, skip=args.bptt).to(device) index.freeze(silent=True).tofile( os.path.join(args.data, 'vocab_chars.txt' if args.chars else 'vocab_tokens.txt')) test_ = __SequenceDataset(args.data, subset='test.txt', index=index, seqlen=args.bptt, skip=args.bptt).to(device) valid_ = __SequenceDataset(args.data, subset='valid.txt',
def __init__(self, vectordim = 300):
    """Create an empty embedding holding vectors of ``vectordim`` components.

    Args:
        vectordim: Number of components per vector.
    """
    self.index = Index()
    self.vdim = vectordim
    # No vectors yet: a float32 matrix with zero rows and `vdim` columns.
    empty_shape = (0, self.vdim)
    self.data = np.zeros(empty_shape, dtype = np.float32)
    # No search index has been built at this point.
    self.invindex = None