def __init__(self, filename, vocabs, builder, *args, **kwargs):
  """"""
  super(Dataset, self).__init__(*args, **kwargs)
  self.vocabs = vocabs
  self.train_domains_set = (set(self.train_domains.split(','))
                            if self.train_domains != '-' and self.name == "Trainset"
                            else set())
  print("Loading training data from domains:",
        self.train_domains_set if self.train_domains_set else "all")
  self._file_iterator = self.file_iterator(filename)
  self._train = (filename == self.train_file)
  self._metabucket = Metabucket(self._config, n_bkts=self.n_bkts)
  self._data = None
  self.rebucket()

  if self.use_elmo:
    from lib.models import ElmoLSTMEncoder
    with tf.variable_scope(tf.get_variable_scope(), reuse=(self.name != "Trainset")):
      self.elmo_encoder = ElmoLSTMEncoder(self)

  self.inputs = tf.placeholder(dtype=tf.int32, shape=(None, None, None), name='inputs')
  self.targets = tf.placeholder(dtype=tf.int32, shape=(None, None, None), name='targets')
  self.step = tf.placeholder_with_default(0., shape=None, name='step')
  self.builder = builder()
def __init__(self, filename, vocabs, builder, *args, **kwargs):
  """"""
  self.forest_file_name = kwargs.pop("forest_file", None)
  if self.forest_file_name is not None:
    print("[tlog] self.forest_file_name: " + self.forest_file_name)
  super(Dataset, self).__init__(*args, **kwargs)
  self.vocabs = vocabs
  self._file_iterator = self.file_iterator(filename)
  self._train = (filename == self.train_file)
  self._forest_data = self.load_forest_file(self.forest_file_name)
  self._metabucket = Metabucket(self._config, n_bkts=self.n_bkts)
  self._data = None
  self.rebucket()

  self.inputs = tf.placeholder(dtype=tf.int32, shape=(None, None, None), name='inputs')
  self.targets = tf.placeholder(dtype=tf.int32, shape=(None, None, None), name='targets')
  self.in_neighbor = tf.placeholder(dtype=tf.int32, shape=(None, None, None), name='in_neighbor')  # [batch, word, neigh]
  self.in_neighbor_rel = tf.placeholder(dtype=tf.int32, shape=(None, None, None), name='in_neighbor_rel')  # [batch, word, neigh]
  self.in_neighbor_mask = tf.placeholder(dtype=tf.int32, shape=(None, None, None), name='in_neighbor_mask')  # [batch, word, neigh]
  self.out_neighbor = tf.placeholder(dtype=tf.int32, shape=(None, None, None), name='out_neighbor')  # [batch, word, neigh]
  self.out_neighbor_rel = tf.placeholder(dtype=tf.int32, shape=(None, None, None), name='out_neighbor_rel')  # [batch, word, neigh]
  self.out_neighbor_mask = tf.placeholder(dtype=tf.int32, shape=(None, None, None), name='out_neighbor_mask')  # [batch, word, neigh]
  self.builder = builder()
def __init__(self, filename, vocabs, builder, *args, **kwargs):
  """"""
  super(Dataset, self).__init__(*args, **kwargs)
  self.vocabs = vocabs
  self.train_domains_set = (set(self.train_domains.split(','))
                            if self.train_domains != '-' and self.name == "Trainset"
                            else set())
  print("Loading training data from domains:",
        self.train_domains_set if self.train_domains_set else "all")
  self._file_iterator = self.file_iterator(filename)
  self._train = (filename == self.train_file)
  self._metabucket = Metabucket(self._config, n_bkts=self.n_bkts)
  self._data = None
  self.rebucket()

  self.inputs = tf.placeholder(dtype=tf.int32, shape=(None, None, None), name='inputs')
  self.targets = tf.placeholder(dtype=tf.int32, shape=(None, None, None), name='targets')
  self.step = tf.placeholder_with_default(0., shape=None, name='step')
  self.builder = builder()
def __init__(self, filename, vocabs, builder, *args, **kwargs):
  """"""
  super(Dataset, self).__init__(*args, **kwargs)
  self._file_iterator = self.file_iterator(filename)
  self._train = (filename == self.train_file)
  self._metabucket = Metabucket(self._config, n_bkts=self.n_bkts)
  self._data = None
  self.vocabs = vocabs
  self.rebucket()

  self.inputs = tf.placeholder(dtype=tf.int32, shape=(None, None, None), name='inputs')
  self.targets = tf.placeholder(dtype=tf.int32, shape=(None, None, None), name='targets')
  self.builder = builder()
def rebucket(self):
  """"""
  buff = next(self._file_iterator)
  # print(f"Dataset.rebucket {buff}")
  len_cntr = Counter()
  for sent in buff:
    len_cntr[len(sent)] += 1
  n_bkts = self.n_bkts if len(len_cntr) >= self.n_bkts else len(len_cntr)
  # print(f"Dataset.rebucket n_bkts: {n_bkts}, {self.n_bkts}")
  self._metabucket = Metabucket(self._config, n_bkts=n_bkts)
  splits = KMeans(n_bkts, len_cntr).splits
  # print(f"Dataset.rebucket splits: {splits}")
  self.reset(splits)
  for sent in buff:
    self._metabucket.add(sent)
  self._finalize()
  return
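# --- Illustrative sketch (not project code) ---------------------------------
# rebucket() counts sentence lengths and asks KMeans(n_bkts, len_cntr) for
# length cut points, so each Metabucket bucket holds sentences of similar
# length and padding is minimized. The helper below is a simplified, greedy
# stand-in for that idea (the real KMeans class is not shown here); the name
# greedy_length_splits is hypothetical.
from collections import Counter

def greedy_length_splits(len_cntr, n_bkts):
  """Split the sorted sentence lengths into n_bkts groups of roughly equal
  size and return the maximum length admitted by each bucket."""
  lengths = sorted(len_cntr.elements())
  per_bucket = max(len(lengths) // n_bkts, 1)
  return sorted({lengths[min((i + 1) * per_bucket - 1, len(lengths) - 1)]
                 for i in range(n_bkts)})

# Example: two buckets over sentences of length 5, 5, 5, 7, 7, 12, 12, 12, 12, 30.
print(greedy_length_splits(Counter({5: 3, 7: 2, 12: 4, 30: 1}), n_bkts=2))  # [7, 30]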
class Dataset(Configurable): """""" # ============================================================= def __init__(self, filename, vocabs, builder, *args, **kwargs): """""" super(Dataset, self).__init__(*args, **kwargs) self.vocabs = vocabs self.train_domains_set = (set(self.train_domains.split(',')) if (self.train_domains != '-' and self.name == "Trainset") else set()) # print("Loading training data from domains:", self.train_domains_set # if self.train_domains_set else "all") self._file_iterator = self.file_iterator(filename) self._train = (filename == self.train_file) self._metabucket = Metabucket(self._config, n_bkts=self.n_bkts) self._data = None self.reset([]) if filename: self.rebucket() if self.use_elmo: from lib.models import ElmoLSTMEncoder with tf.compat.v1.variable_scope(tf.get_variable_scope(), reuse=(self.name != "Trainset")): self.elmo_encoder = ElmoLSTMEncoder(self) self.inputs = tf.compat.v1.placeholder(dtype=tf.int32, shape=(None, None, None), name='inputs') self.targets = tf.compat.v1.placeholder(dtype=tf.int32, shape=(None, None, None), name='targets') self.step = tf.compat.v1.placeholder_with_default(0., shape=None, name='step') self.builder = builder() # ============================================================= def file_iterator(self, filename): """""" if not filename: yield [[]] # print(f"Dataset.file_iterator {filename}") with open(filename) as f: if self.lines_per_buffer > 0: buff = [[]] while True: line = f.readline() while line: # print(f"Dataset.file_iterator line: {line}") line = line.strip().split() if line and (not self.train_domains_set or line[0].split('/')[0] in self.train_domains): buff[-1].append(line) else: if len(buff) < self.lines_per_buffer: if len(buff[-1]) > 0: buff.append([]) else: buff[-1] = [] else: break line = f.readline() if not line: f.seek(0) else: buff = self._process_buff(buff) yield buff line = line.strip().split() if line: buff = [[line]] else: buff = [[]] else: buff = [[]] for line in f: line = line.strip().split() # print(f"Dataset.file_iterator line: {line}") if line and (not self.train_domains_set or line[0].split('/')[0] in self.train_domains): buff[-1].append(line) else: if len(buff[-1]) > 0: buff.append([]) else: buff[-1] = [] if buff[-1] == []: buff.pop() buff = self._process_buff(buff) while True: yield buff # ============================================================= def _process_buff(self, buff): """""" # tmp_f = open("debug_data_%s" % self.name, 'w') words, tags, rels, srls, predicates, domains = self.vocabs srl_start_field = srls.conll_idx[0] sents = 0 toks = 0 examples = 0 total_predicates = 0 buff2 = [] for i, sent in enumerate(buff): sents += 1 sent_len = len(sent) num_fields = len(sent[0]) srl_take_indices = [ idx for idx in list( range(srl_start_field, srl_start_field + sent_len)) if idx < num_fields - 1 and (self.train_on_nested or np.all( ['/' not in sent[j][idx] for j in list(range(sent_len))])) ] predicate_indices = [] for j, token in enumerate(sent): toks += 1 if self.conll: word, tag1, tag2, head, rel = (token[words.conll_idx], token[tags.conll_idx[0]], token[tags.conll_idx[1]], token[6], token[rels.conll_idx]) if rel == 'root': head = j else: head = int(head) - 1 buff[i][j] = ((word, ) + words[word] + tags[tag1] + tags[tag2] + (head, ) + rels[rel]) elif self.conll2012: # print(f"Dataset._process_buff token {j}:{token}") word, auto_tag, gold_tag, head, rel = ( token[words.conll_idx], token[tags.conll_idx[0]], token[tags.conll_idx[1]], token[6], token[rels.conll_idx]) # print(f"Dataset token read {word}, 
{auto_tag}, # {gold_tag}, {head}, {rel}") domain = token[0].split('/')[0] # print(word, tag1, tag2, head, rel) if rel == 'root': head = j else: head = int(head) - 1 # srl_fields = [token[idx] if idx < len(token)-1 else 'O' # for idx in list(range(srl_start_field, srl_start_field # + sent_len)) # todo can we use fancy indexing here? srl_fields = [token[idx] for idx in srl_take_indices] srl_fields += ['O'] * (sent_len - len(srl_take_indices)) srl_tags = [srls[s][0] for s in srl_fields] if self.joint_pos_predicates: is_predicate = (token[predicates.conll_idx[0]] != '-' and (self.train_on_nested or self.predicate_str in srl_fields)) tok_predicate_str = str(is_predicate) + '/' + gold_tag else: is_predicate = (token[predicates.conll_idx] != '-' and (self.train_on_nested or self.predicate_str in srl_fields)) tok_predicate_str = str(is_predicate) if is_predicate: predicate_indices.append(j) buff[i][j] = ((word, ) + words[word] + tags[auto_tag] + predicates[tok_predicate_str] + domains[domain] + (sents, ) + tags[gold_tag] + (head, ) + rels[rel] + tuple(srl_tags)) # print(f"Dataset buff[{i}][{j}] = {buff[i][j]}") # print(f"Dataset buff {word}, {word}, {auto_tag}, # {tok_predicate_str}, {domain}, {sents}, {gold_tag}, # {head}, {rel}, {srl_tags}") # Expand sentences into one example per predicate if self.one_example_per_predicate: # grab the sent # should be sent_len x sent_elements sent = np.array(buff[i]) # print(sent) is_predicate_idx = 4 srl_start_idx = 10 word_part = sent[:, 0].astype('O') srl_part = sent[:, srl_start_idx:].astype(np.int32) rest_part = sent[:, 1:srl_start_idx].astype(np.int32) # print("orig sent (%d):" % len(predicate_indices), # sent[:, :8+len(predicate_indices)]) # print("orig preds:", [map(lambda x: srls[int(x)], t) for t # in sent[:, srl_start_idx:srl_start_idx+len( # predicate_indices)]]) if predicate_indices: for k, p_idx in enumerate(predicate_indices): # should be sent_len x sent_elements rest_part[:, is_predicate_idx - 1] = predicates["False"][0] rest_part[p_idx, is_predicate_idx - 1] = predicates["True"][0] correct_srls = srl_part[:, k] new_sent = np.concatenate([ np.expand_dims(word_part, -1), rest_part, np.expand_dims(correct_srls, -1) ], axis=1) buff2.append(new_sent) # print("new sent:", new_sent) # print("new preds:", map(lambda x: srls[int(x)], # new_sent[:, -1])) # tokens_str = ' '.join(word_part) # labels_str = ' '.join(map(lambda x: srls[x], # correct_srls)) # # idx, tokens, labels # print("%d %s ||| %s" % (p_idx, tokens_str, # labels_str), file=tmp_f) total_predicates += 1 examples += 1 else: new_sent = np.concatenate( [np.expand_dims(word_part, -1), rest_part], axis=1) buff2.append(new_sent) examples += 1 # else: # buff2.append(np.concatenate[np.expand_dims(word_part, -1), # rest_part, srl_part], axis=1) #(sent[0],) + map(int, sent[1:])) # examples += 1 # tmp_f.close() if self.one_example_per_predicate: # print("Loaded %d sentences with %d tokens, %d examples # (%d predicates) (%s)" % (sents, toks, examples, # total_predicates, self.name)) return buff2 else: # print(f"Loaded {sents} sentences with {toks} tokens {self.name}") return buff # ============================================================= def reset(self, sizes): """""" self._data = [] self._targets = [] self._metabucket.reset(sizes) return # ============================================================= def rebucket(self): """""" buff = next(self._file_iterator) # print(f"Dataset.rebucket {buff}") len_cntr = Counter() for sent in buff: len_cntr[len(sent)] += 1 n_bkts = self.n_bkts if len(len_cntr) 
>= self.n_bkts else len(len_cntr) # print(f"Dataset.rebucket n_bkts: {n_bkts}, {self.n_bkts}") self._metabucket = Metabucket(self._config, n_bkts=n_bkts) splits = KMeans(n_bkts, len_cntr).splits # print(f"Dataset.rebucket splits: {splits}") self.reset(splits) for sent in buff: self._metabucket.add(sent) self._finalize() return # ============================================================= def _finalize(self): """""" self._metabucket._finalize() return # ============================================================= def get_minibatches(self, batch_size, input_idxs, target_idxs, shuffle=True): """""" # print(f"Dataset.get_minibatches {batch_size}, {input_idxs}, # {target_idxs}, {shuffle}") minibatches = [] for bkt_idx, bucket in enumerate(self._metabucket): if batch_size == 0: n_splits = 1 else: n_tokens = len(bucket) * bucket.size n_splits = max(n_tokens // batch_size, 1) if shuffle: range_func = np.random.permutation else: range_func = np.arange arr_sp = np.array_split(range_func(len(bucket)), n_splits) for bkt_mb in arr_sp: minibatches.append((bkt_idx, bkt_mb)) if shuffle: np.random.shuffle(minibatches) for bkt_idx, bkt_mb in minibatches: feed_dict = {} data = self[bkt_idx].data[bkt_mb] sents = self[bkt_idx].sents[bkt_mb] try: maxlen = np.max(np.sum(np.greater(data[:, :, 0], 0), axis=1)) except IndexError as e: print(f"IndexError while enumerating minibatches", file=sys.stderr) raise DatasetError(f"IndexError while enumerating minibatches") # np.set_printoptions(threshold=np.nan) # print("maxlen", maxlen) # print("maxlen+max(target_idxs)", maxlen+max(target_idxs)) # print("data.shape[2]", data.shape[2]) # targets = data[:,:maxlen,min(target_idxs): # maxlen+max(target_idxs)+1] # print("targets shape", targets.shape) # print("data[:,:,3:] shape", targets[:,:,3:].shape) feed_dict.update({ self.inputs: data[:, :maxlen, input_idxs], self.targets: data[:, :maxlen, min(target_idxs):maxlen + max(target_idxs) + 1] }) if self.use_elmo: feed_dict = self.elmo_encoder.get_feed_dict(feed_dict, sents) # print(f"Dataset.get_minibatches yields {feed_dict}") yield feed_dict, sents # ============================================================= @property def n_bkts(self): if self._train: return super(Dataset, self).n_bkts else: return super(Dataset, self).n_valid_bkts # ============================================================= def max_batch_size(self): # print([b._data.shape[0] for b in self._metabucket._buckets]) max_batch_size = np.max( [b._data.shape[0] for b in self._metabucket._buckets]) # print("max batch size: ", max_batch_size) if self.name in ["Testset", "Analyzeset"]: return self.max_test_batch_size elif self.name == "Validset": return self.max_dev_batch_size return max_batch_size # ============================================================= def __getitem__(self, key): return self._metabucket[key] def __len__(self): return len(self._metabucket)
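# --- Illustrative sketch (not project code) ---------------------------------
# When one_example_per_predicate is set, _process_buff above turns a sentence
# with k predicates into k examples, each keeping only the SRL column of its
# own predicate and re-marking the is-predicate feature. The toy function below
# (hypothetical names, simplified integer columns) shows that expansion in
# isolation.
import numpy as np

def expand_per_predicate(rest_part, srl_part, predicate_indices, is_pred_col):
  """rest_part: [sent_len, n_feats]; srl_part: [sent_len, n_predicates]."""
  examples = []
  for k, p_idx in enumerate(predicate_indices):
    feats = rest_part.copy()
    feats[:, is_pred_col] = 0      # mark every token as "not the predicate"
    feats[p_idx, is_pred_col] = 1  # ...except this example's predicate
    examples.append(np.concatenate([feats, srl_part[:, k:k + 1]], axis=1))
  return examples

rest = np.zeros((4, 3), dtype=np.int32)
srl = np.arange(8, dtype=np.int32).reshape(4, 2)  # 2 predicates -> 2 SRL columns
out = expand_per_predicate(rest, srl, predicate_indices=[1, 3], is_pred_col=2)
print(len(out), out[0].shape)  # 2 (4, 4)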
class Dataset(Configurable):
  """"""

  #=============================================================
  def __init__(self, filename, vocabs, builder, *args, **kwargs):
    """"""
    super(Dataset, self).__init__(*args, **kwargs)
    self._file_iterator = self.file_iterator(filename)
    self._train = (filename == self.train_file)
    self._metabucket = Metabucket(self._config, n_bkts=self.n_bkts)
    self._data = None
    self.vocabs = vocabs
    self.rebucket()

    self.inputs = tf.placeholder(dtype=tf.int32, shape=(None, None, None), name='inputs')
    self.targets = tf.placeholder(dtype=tf.int32, shape=(None, None, None), name='targets')
    self.builder = builder()

  #=============================================================
  def file_iterator(self, filename):
    """"""
    with open(filename) as f:
      if self.lines_per_buffer > 0:
        # Buffered mode: repeatedly read lines_per_buffer sentences, wrapping
        # around to the start of the file when it is exhausted.
        buff = [[]]
        while True:
          line = f.readline()
          while line:
            line = line.strip().split()
            if line:
              buff[-1].append(line)
            else:
              if len(buff) < self.lines_per_buffer:
                if buff[-1]:
                  buff.append([])
              else:
                break
            line = f.readline()
          if not line:
            f.seek(0)
          else:
            buff = self._process_buff(buff)
            yield buff
            line = line.strip().split()
            if line:
              buff = [[line]]
            else:
              buff = [[]]
      else:
        # Otherwise read the whole file once and yield the processed buffer forever.
        buff = [[]]
        for line in f:
          line = line.strip().split()
          if line:
            buff[-1].append(line)
          else:
            if buff[-1]:
              buff.append([])
        if buff[-1] == []:
          buff.pop()
        buff = self._process_buff(buff)
        while True:
          yield buff

  #=============================================================
  def _process_buff(self, buff):
    """"""
    # Map each CoNLL token to (word, word ids, tag ids, head index, rel id)
    # and prepend a ROOT token to every sentence.
    words, tags, rels = self.vocabs
    for i, sent in enumerate(buff):
      for j, token in enumerate(sent):
        word, tag1, tag2, head, rel = token[words.conll_idx], token[tags.conll_idx[0]], token[tags.conll_idx[1]], token[6], token[rels.conll_idx]
        buff[i][j] = (word,) + words[word] + tags[tag1] + tags[tag2] + (int(head),) + rels[rel]
      sent.insert(0, ('root', Vocab.ROOT, Vocab.ROOT, Vocab.ROOT, Vocab.ROOT, 0, Vocab.ROOT))
    return buff

  #=============================================================
  def reset(self, sizes):
    """"""
    self._data = []
    self._targets = []
    self._metabucket.reset(sizes)
    return

  #=============================================================
  def rebucket(self):
    """"""
    buff = next(self._file_iterator)
    len_cntr = Counter()
    for sent in buff:
      len_cntr[len(sent)] += 1
    self.reset(KMeans(self.n_bkts, len_cntr).splits)
    for sent in buff:
      self._metabucket.add(sent)
    self._finalize()
    return

  #=============================================================
  def _finalize(self):
    """"""
    self._metabucket._finalize()
    return

  #=============================================================
  def get_minibatches(self, batch_size, input_idxs, target_idxs, shuffle=True):
    """"""
    minibatches = []
    for bkt_idx, bucket in enumerate(self._metabucket):
      if batch_size == 0:
        n_splits = 1
      #elif not self.minimize_pads:
      #  n_splits = max(len(bucket) // batch_size, 1)
      #  if bucket.size > 100:
      #    n_splits *= 2
      else:
        n_tokens = len(bucket) * bucket.size
        n_splits = max(n_tokens // batch_size, 1)
      if shuffle:
        range_func = np.random.permutation
      else:
        range_func = np.arange
      arr_sp = np.array_split(range_func(len(bucket)), n_splits)
      for bkt_mb in arr_sp:
        minibatches.append((bkt_idx, bkt_mb))
    if shuffle:
      np.random.shuffle(minibatches)
    for bkt_idx, bkt_mb in minibatches:
      data = self[bkt_idx].data[bkt_mb]
      sents = self[bkt_idx].sents[bkt_mb]
      # Longest sentence in this minibatch (column 0 holds word ids; 0 is padding).
      maxlen = np.max(np.sum(np.greater(data[:, :, 0], 0), axis=1))
      feed_dict = {
        self.inputs: data[:, :maxlen, input_idxs],
        self.targets: data[:, :maxlen, target_idxs]
      }
      yield feed_dict, sents

  #=============================================================
  def get_minibatches2(self, batch_size, input_idxs, target_idxs):
    """"""
    bkt_lens = np.empty(len(self._metabucket))
    for i, bucket in enumerate(self._metabucket):
      bkt_lens[i] = len(bucket)
    total_sents = np.sum(bkt_lens)
    bkt_probs = bkt_lens / total_sents
    n_sents = 0
    while n_sents < total_sents:
      n_sents += batch_size
      bkt = np.random.choice(self._metabucket._buckets, p=bkt_probs)
      data = bkt.data[np.random.randint(len(bkt), size=batch_size)]
      if bkt.size > 100:
        for data_ in np.array_split(data, 2):
          feed_dict = {
            self.inputs: data_[:, :, input_idxs],
            self.targets: data_[:, :, target_idxs]
          }
          yield feed_dict
      else:
        feed_dict = {
          self.inputs: data[:, :, input_idxs],
          self.targets: data[:, :, target_idxs]
        }
        yield feed_dict

  #=============================================================
  @property
  def n_bkts(self):
    if self._train:
      return super(Dataset, self).n_bkts
    else:
      return super(Dataset, self).n_valid_bkts

  #=============================================================
  def __getitem__(self, key):
    return self._metabucket[key]

  def __len__(self):
    return len(self._metabucket)
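# --- Illustrative sketch (not project code) ---------------------------------
# get_minibatches yields (feed_dict, sents) pairs keyed by the dataset's own
# placeholders, so a TF1-style training loop only has to pass each feed_dict to
# Session.run. train_op and loss below are hypothetical graph nodes built
# elsewhere from dataset.inputs / dataset.targets.
def run_epoch(sess, dataset, train_op, loss, batch_size, input_idxs, target_idxs):
  total_loss, n_batches = 0.0, 0
  for feed_dict, _sents in dataset.get_minibatches(batch_size, input_idxs,
                                                   target_idxs, shuffle=True):
    _, batch_loss = sess.run([train_op, loss], feed_dict=feed_dict)
    total_loss += batch_loss
    n_batches += 1
  return total_loss / max(n_batches, 1)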
class Dataset(Configurable): """""" #============================================================= def __init__(self, filename, vocabs, builder, *args, **kwargs): """""" self.forest_file_name = kwargs.pop("forest_file", None) if self.forest_file_name is not None: print("[tlog] self.forest_file_name: " + self.forest_file_name) super(Dataset, self).__init__(*args, **kwargs) self.vocabs = vocabs self._file_iterator = self.file_iterator(filename) self._train = (filename == self.train_file) self._forest_data = self.load_forest_file(self.forest_file_name) self._metabucket = Metabucket(self._config, n_bkts=self.n_bkts) self._data = None self.rebucket() self.inputs = tf.placeholder(dtype=tf.int32, shape=(None,None,None), name='inputs') self.targets = tf.placeholder(dtype=tf.int32, shape=(None,None,None), name='targets') self.in_neighbor = tf.placeholder(dtype=tf.int32, shape=(None,None,None), name='in_neighbor') # [batch, word, neigh] self.in_neighbor_rel = tf.placeholder(dtype=tf.int32, shape=(None,None,None), name='in_neighbor_rel') # [batch, word, neigh] self.in_neighbor_mask = tf.placeholder(dtype=tf.int32, shape=(None, None, None), name='in_neighbor_mask') # [batch, word, neigh] self.out_neighbor = tf.placeholder(dtype=tf.int32, shape=(None, None, None), name='out_neighbor') # [batch, word, neigh] self.out_neighbor_rel = tf.placeholder(dtype=tf.int32, shape=(None, None, None), name='out_neighbor_rel') # [batch, word, neigh] self.out_neighbor_mask = tf.placeholder(dtype=tf.int32, shape=(None, None, None), name='out_neighbor_mask') # [batch, word, neigh] self.builder = builder() #============================================================= def file_iterator(self, filename): """""" with open(filename) as f: if self.lines_per_buffer > 0: buff = [[]] while True: line = f.readline() while line: line = line.strip().split() if line: buff[-1].append(line) else: if len(buff) < self.lines_per_buffer: if buff[-1]: buff.append([]) else: break line = f.readline() if not line: f.seek(0) else: buff = self._process_buff(buff) yield buff line = line.strip().split() if line: buff = [[line]] else: buff = [[]] else: buff = [[]] for line in f: line = line.strip().split() if line: buff[-1].append(line) else: if buff[-1]: buff.append([]) if buff[-1] == []: buff.pop() buff = self._process_buff(buff) while True: yield buff #============================================================= def _remove_duplicate_items(self, node_index, neighbor, neighbor_rel, add_self=True, REL_UNK=2): unique_neighbor, unique_neighbor_rel = [], [] node_cache = set() if add_self: unique_neighbor.append(node_index) unique_neighbor_rel.append(REL_UNK) node_cache.add((node_index, REL_UNK)) for i in range(len(neighbor)): if (neighbor[i], neighbor_rel[i]) not in node_cache: unique_neighbor.append(neighbor[i]) unique_neighbor_rel.append(neighbor_rel[i]) node_cache.add((neighbor[i], neighbor_rel[i])) return unique_neighbor, unique_neighbor_rel # ============================================================= def _process_buff(self, buff): """""" words, tags, rels = self.vocabs for i, sent in enumerate(buff): if self.use_forest: sent_str = tuple([token[1] for token in sent]) triples, adj_lists = self._forest_data[len(sent_str)][sent_str] #print("[tlog] adj_lists: " + str(adj_lists)) #sys.exit(0) for j, token in enumerate(sent): word, tag1, tag2, head, rel = token[words.conll_idx], \ token[tags.conll_idx[0]], token[tags.conll_idx[1]], token[6], token[rels.conll_idx] if self.use_forest: #print("[tlog] adj_lists: " + str(adj_lists[0][j + 1]) + "\t" + 
str(adj_lists[1][j + 1])) unique_in_neighbor, unique_in_neighbor_rel = self._remove_duplicate_items(j+1, adj_lists[0][j+1], adj_lists[1][j+1]) unique_out_neighbor, unique_out_neighbor_rel = self._remove_duplicate_items(j+1, adj_lists[2][j+1], adj_lists[3][j+1]) #print("[tlog] adj_lists: " + str(adj_lists[0][j + 1]) + "\t" + str(adj_lists[1][j + 1])) #sys.exit(0) buff[i][j] = (word,) + words[word] + tags[tag1] + tags[tag2] + (int(head),) + rels[rel] + \ (unique_in_neighbor, unique_in_neighbor_rel, unique_out_neighbor, unique_out_neighbor_rel) else: buff[i][j] = (word,) + words[word] + tags[tag1] + tags[tag2] + (int(head),) + rels[rel] if self.use_forest: unique_in_neighbor, unique_in_neighbor_rel = self._remove_duplicate_items(0, adj_lists[0][0], adj_lists[1][0]) unique_out_neighbor, unique_out_neighbor_rel = self._remove_duplicate_items(0, adj_lists[2][0], adj_lists[3][0]) sent.insert(0, ('root', Vocab.ROOT, Vocab.ROOT, Vocab.ROOT, Vocab.ROOT, 0, Vocab.ROOT, \ unique_in_neighbor, unique_in_neighbor_rel, unique_out_neighbor, unique_out_neighbor_rel)) else: sent.insert(0, ('root', Vocab.ROOT, Vocab.ROOT, Vocab.ROOT, Vocab.ROOT, 0, Vocab.ROOT)) return buff # ============================================================= def load_forest_file(self, forest_file_name): if forest_file_name is None or not self.use_forest: return if self.forest_type == 0: return load_nbest(forest_file_name, self.nbest_only_keep, self.vocabs[2]) elif self.forest_type == 1: return load_cube(forest_file_name, self.cube_only_keep) elif self.forest_type == 2: return load_cube(forest_file_name, self.nbest_only_keep) elif self.forest_type == 3: return load_cubesparse(forest_file_name, self.cube_only_keep, self.vocabs[2]) else: print("[Error] forest_type must be in [0, 1, 2]\n " + "\t 0 --- nbest, 10 \n" + "\t 1 --- cube, 0.05 \n" + "\t 2 --- cube, 10 \n" + "\t 3 --- cubesparse, 0.05 \n") sys.exit(0) #============================================================= def reset(self, sizes): """""" self._data = [] self._targets = [] self._metabucket.reset(sizes) return #============================================================= def rebucket(self): """""" buff = self._file_iterator.next() len_cntr = Counter() for sent in buff: len_cntr[len(sent)] += 1 self.reset(KMeans(self.n_bkts, len_cntr).splits) for sent in buff: self._metabucket.add(sent) self._finalize() return #============================================================= def _finalize(self): """""" self._metabucket._finalize() return #============================================================= def get_minibatches(self, batch_size, input_idxs, target_idxs, forest_idxs=None, shuffle=True): """""" minibatches = [] for bkt_idx, bucket in enumerate(self._metabucket): if batch_size == 0: n_splits = 1 else: n_tokens = len(bucket) * bucket.size n_splits = max(n_tokens // batch_size, 1) if shuffle: range_func = np.random.permutation else: range_func = np.arange arr_sp = np.array_split(range_func(len(bucket)), n_splits) for bkt_mb in arr_sp: minibatches.append((bkt_idx, bkt_mb)) if shuffle: np.random.shuffle(minibatches) for bkt_idx, bkt_mb in minibatches: feed_dict = {} data = self[bkt_idx].data[bkt_mb] sents = self[bkt_idx].sents[bkt_mb] maxlen = np.max(np.sum(np.greater(data[:, :, 0], 0), axis=1)) feed_dict.update({ self.inputs: data[:, :maxlen, input_idxs], self.targets: data[:, :maxlen, target_idxs] }) if self.use_forest and forest_idxs is not None: in_neighbor_data = self[bkt_idx].in_neighbor_data[bkt_mb] in_neighbor_rel_data = self[bkt_idx].in_neighbor_rel_data[bkt_mb] 
in_neighbor_mask= self[bkt_idx].in_neighbor_mask[bkt_mb] out_neighbor_data = self[bkt_idx].out_neighbor_data[bkt_mb] out_neighbor_rel_data = self[bkt_idx].out_neighbor_rel_data[bkt_mb] out_neighbor_mask = self[bkt_idx].out_neighbor_mask[bkt_mb] feed_dict.update({ self.in_neighbor: in_neighbor_data[:, :maxlen], self.in_neighbor_rel: in_neighbor_rel_data[:, :maxlen], self.in_neighbor_mask: in_neighbor_mask[:, :maxlen], self.out_neighbor: out_neighbor_data[:, :maxlen], self.out_neighbor_rel: out_neighbor_rel_data[:, :maxlen], self.out_neighbor_mask: out_neighbor_mask[:, :maxlen], }) yield feed_dict, sents #============================================================= @property def n_bkts(self): if self._train: return super(Dataset, self).n_bkts else: return super(Dataset, self).n_valid_bkts #============================================================= def __getitem__(self, key): return self._metabucket[key] def __len__(self): return len(self._metabucket)
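# --- Illustrative sketch (not project code) ---------------------------------
# The forest variant's placeholders (in_neighbor, in_neighbor_rel,
# in_neighbor_mask and their out_* counterparts) expect dense
# [batch, word, neigh] integer arrays, while _remove_duplicate_items returns
# ragged per-word neighbor lists. The bucket code that performs the padding is
# not shown here; the hypothetical helper below illustrates one way such
# padding and masking could look for a single sentence.
import numpy as np

def pad_neighbors(neigh_lists, max_neigh=None, pad_id=0):
  """neigh_lists: one list of neighbor indices per word."""
  if max_neigh is None:
    max_neigh = max((len(n) for n in neigh_lists), default=1)
  ids = np.full((len(neigh_lists), max_neigh), pad_id, dtype=np.int32)
  mask = np.zeros((len(neigh_lists), max_neigh), dtype=np.int32)
  for w, neighbors in enumerate(neigh_lists):
    k = min(len(neighbors), max_neigh)
    ids[w, :k] = neighbors[:k]
    mask[w, :k] = 1
  return ids, mask

ids, mask = pad_neighbors([[1], [0, 2], [1]])
print(ids.shape, int(mask.sum()))  # (3, 2) 4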
class Dataset(Configurable): """""" #============================================================= def __init__(self, filename, vocabs, builder, *args, **kwargs): """""" super(Dataset, self).__init__(*args, **kwargs) self.vocabs = vocabs self.train_domains_set = set( self.train_domains.split(',') ) if self.train_domains != '-' and self.name == "Trainset" else set() print("Loading training data from domains:", self.train_domains_set if self.train_domains_set else "all") self._file_iterator = self.file_iterator(filename) self._train = (filename == self.train_file) self._metabucket = Metabucket(self._config, n_bkts=self.n_bkts) self._data = None self.rebucket() self.inputs = tf.placeholder(dtype=tf.int32, shape=(None, None, None), name='inputs') self.targets = tf.placeholder(dtype=tf.int32, shape=(None, None, None), name='targets') self.step = tf.placeholder_with_default(0., shape=None, name='step') self.builder = builder() #============================================================= def file_iterator(self, filename): """""" with open(filename) as f: if self.lines_per_buffer > 0: buff = [[]] while True: line = f.readline() while line: line = line.strip().split() if line and (not self.train_domains_set or line[0].split('/')[0] in self.train_domains): buff[-1].append(line) else: if len(buff) < self.lines_per_buffer: if len(buff[-1]) > 0: buff.append([]) else: buff[-1] = [] else: break line = f.readline() if not line: f.seek(0) else: buff = self._process_buff(buff) yield buff line = line.strip().split() if line: buff = [[line]] else: buff = [[]] else: buff = [[]] for line in f: line = line.strip().split() if line and (not self.train_domains_set or line[0].split('/')[0] in self.train_domains): buff[-1].append(line) else: if len(buff[-1]) > 0: buff.append([]) else: buff[-1] = [] if buff[-1] == []: buff.pop() buff = self._process_buff(buff) while True: yield buff #============================================================= def _process_buff(self, buff): """""" # tmp_f = open("debug_data_%s" % self.name, 'w') words, tags, rels, srls, predicates, domains = self.vocabs srl_start_field = srls.conll_idx[0] sents = 0 toks = 0 examples = 0 total_predicates = 0 buff2 = [] for i, sent in enumerate(buff): # if not self.conll2012 or (self.conll2012 and len(list(sent)) > 1): # print(sent, len(sent)) sents += 1 sent_len = len(sent) num_fields = len(sent[0]) srl_take_indices = [ idx for idx in range(srl_start_field, srl_start_field + sent_len) if idx < num_fields - 1 and (self.train_on_nested or np.all( ['/' not in sent[j][idx] for j in range(sent_len)])) ] predicate_indices = [] for j, token in enumerate(sent): toks += 1 if self.conll: word, tag1, tag2, head, rel = token[ words.conll_idx], token[tags.conll_idx[0]], token[ tags.conll_idx[1]], token[6], token[rels.conll_idx] if rel == 'root': head = j else: head = int(head) - 1 buff[i][j] = ( word, ) + words[word] + tags[tag1] + tags[tag2] + ( head, ) + rels[rel] elif self.conll2012: word, auto_tag, gold_tag, head, rel = token[ words.conll_idx], token[tags.conll_idx[0]], token[ tags.conll_idx[1]], token[6], token[rels.conll_idx] domain = token[0].split('/')[0] # print(word, tag1, tag2, head, rel) if rel == 'root': head = j else: head = int(head) - 1 # srl_fields = [token[idx] if idx < len(token)-1 else 'O' for idx in range(srl_start_field, srl_start_field + sent_len)] srl_fields = [token[idx] for idx in srl_take_indices ] # todo can we use fancy indexing here? 
srl_fields += ['O'] * (sent_len - len(srl_take_indices)) srl_tags = [srls[s][0] for s in srl_fields] if self.joint_pos_predicates: is_predicate = token[ predicates.conll_idx[0]] != '-' and ( self.train_on_nested or self.predicate_str in srl_fields) tok_predicate_str = str(is_predicate) + '/' + gold_tag else: is_predicate = token[predicates.conll_idx] != '-' and ( self.train_on_nested or self.predicate_str in srl_fields) tok_predicate_str = str(is_predicate) if is_predicate: predicate_indices.append(j) buff[i][j] = ( word, ) + words[word] + tags[auto_tag] + predicates[ tok_predicate_str] + domains[domain] + ( sents, ) + tags[gold_tag] + ( head, ) + rels[rel] + tuple(srl_tags) # Expand sentences into one example per predicate if self.one_example_per_predicate: # grab the sent # should be sent_len x sent_elements sent = np.array(buff[i]) # print(sent) is_predicate_idx = 4 srl_start_idx = 10 word_part = sent[:, 0].astype('O') srl_part = sent[:, srl_start_idx:].astype(np.int32) rest_part = sent[:, 1:srl_start_idx].astype(np.int32) # print("orig sent (%d):" % len(predicate_indices), sent[:, :8+len(predicate_indices)]) # print("orig preds:", [map(lambda x: srls[int(x)], t) for t in sent[:, srl_start_idx:srl_start_idx+len(predicate_indices)]]) if predicate_indices: for k, p_idx in enumerate(predicate_indices): # should be sent_len x sent_elements rest_part[:, is_predicate_idx - 1] = predicates["False"][0] rest_part[p_idx, is_predicate_idx - 1] = predicates["True"][0] correct_srls = srl_part[:, k] new_sent = np.concatenate([ np.expand_dims(word_part, -1), rest_part, np.expand_dims(correct_srls, -1) ], axis=1) buff2.append(new_sent) # print("new sent:", new_sent) # print("new preds:", map(lambda x: srls[int(x)], new_sent[:, -1])) # tokens_str = ' '.join(word_part) # labels_str = ' '.join(map(lambda x: srls[x], correct_srls)) ## idx, tokens, labels # print("%d %s ||| %s" % (p_idx, tokens_str, labels_str), file=tmp_f) total_predicates += 1 examples += 1 else: new_sent = np.concatenate( [np.expand_dims(word_part, -1), rest_part], axis=1) buff2.append(new_sent) examples += 1 # else: # buff2.append(np.concatenate[np.expand_dims(word_part, -1), rest_part, srl_part], axis=1) #(sent[0],) + map(int, sent[1:])) # examples += 1 # tmp_f.close() if self.one_example_per_predicate: print( "Loaded %d sentences with %d tokens, %d examples (%d predicates) (%s)" % (sents, toks, examples, total_predicates, self.name)) return buff2 else: print("Loaded %d sentences with %d tokens (%s)" % (sents, toks, self.name)) return buff #============================================================= def reset(self, sizes): """""" self._data = [] self._targets = [] self._metabucket.reset(sizes) return #============================================================= def rebucket(self): """""" buff = self._file_iterator.next() len_cntr = Counter() for sent in buff: len_cntr[len(sent)] += 1 self.reset(KMeans(self.n_bkts, len_cntr).splits) for sent in buff: self._metabucket.add(sent) self._finalize() return #============================================================= def _finalize(self): """""" self._metabucket._finalize() return #============================================================= def get_minibatches(self, batch_size, input_idxs, target_idxs, shuffle=True): """""" minibatches = [] for bkt_idx, bucket in enumerate(self._metabucket): if batch_size == 0: n_splits = 1 else: n_tokens = len(bucket) * bucket.size n_splits = max(n_tokens // batch_size, 1) if shuffle: range_func = np.random.permutation else: range_func = np.arange 
arr_sp = np.array_split(range_func(len(bucket)), n_splits) for bkt_mb in arr_sp: minibatches.append((bkt_idx, bkt_mb)) if shuffle: np.random.shuffle(minibatches) for bkt_idx, bkt_mb in minibatches: feed_dict = {} data = self[bkt_idx].data[bkt_mb] sents = self[bkt_idx].sents[bkt_mb] maxlen = np.max(np.sum(np.greater(data[:, :, 0], 0), axis=1)) # np.set_printoptions(threshold=np.nan) # print("maxlen", maxlen) # print("maxlen+max(target_idxs)", maxlen+max(target_idxs)) # print("data.shape[2]", data.shape[2]) # targets = data[:,:maxlen,min(target_idxs):maxlen+max(target_idxs)+1] # print("data shape", targets.shape) # print("data[:,:,3:] shape", targets[:,:,3:].shape) feed_dict.update({ self.inputs: data[:, :maxlen, input_idxs], self.targets: data[:, :maxlen, min(target_idxs):maxlen + max(target_idxs) + 1] }) yield feed_dict, sents #============================================================= @property def n_bkts(self): if self._train: return super(Dataset, self).n_bkts else: return super(Dataset, self).n_valid_bkts #============================================================= def __getitem__(self, key): return self._metabucket[key] def __len__(self): return len(self._metabucket)
class Dataset(Configurable): """""" #============================================================= def __init__(self, filename, vocabs, builder, *args, **kwargs): """""" super(Dataset, self).__init__(*args, **kwargs) self._file_iterator = self.file_iterator(filename) self._train = (filename == self.train_file) self._metabucket = Metabucket(self._config, n_bkts=self.n_bkts) self._data = None self.vocabs = vocabs self.rebucket() self.inputs = tf.placeholder(dtype=tf.int32, shape=(None, None, None), name='inputs') self.targets = tf.placeholder(dtype=tf.int32, shape=(None, None, None), name='targets') self.builder = builder() #============================================================= def file_iterator(self, filename): """""" with open(filename) as f: if self.lines_per_buffer > 0: buff = [[]] while True: line = f.readline() while line: line = line.strip().split() if line: buff[-1].append(line) else: if len(buff) < self.lines_per_buffer: if buff[-1]: buff.append([]) else: break line = f.readline() if not line: f.seek(0) else: buff = self._process_buff(buff) yield buff line = line.strip().split() if line: buff = [[line]] else: buff = [[]] else: buff = [[]] for line in f: line = line.strip().split() if line: buff[-1].append(line) else: if buff[-1]: buff.append([]) if buff[-1] == []: buff.pop() buff = self._process_buff(buff) while True: yield buff #============================================================= def _process_buff(self, buff): """""" words, tags, rels = self.vocabs for i, sent in enumerate(buff): for j, token in enumerate(sent): word, tag1, tag2, head, rel = token[words.conll_idx], token[ tags.conll_idx[0]], token[ tags.conll_idx[1]], token[6], token[rels.conll_idx] buff[i][j] = ( word, ) + words[word] + tags[tag1] + tags[tag2] + ( int(head), ) + rels[rel] sent.insert(0, ('root', Vocab.ROOT, Vocab.ROOT, Vocab.ROOT, Vocab.ROOT, 0, Vocab.ROOT)) return buff #============================================================= def reset(self, sizes): """""" self._data = [] self._targets = [] self._metabucket.reset(sizes) return #============================================================= def rebucket(self): """""" buff = self._file_iterator.next() len_cntr = Counter() for sent in buff: len_cntr[len(sent)] += 1 self.reset(KMeans(self.n_bkts, len_cntr).splits) for sent in buff: self._metabucket.add(sent) self._finalize() return #============================================================= def _finalize(self): """""" self._metabucket._finalize() return #============================================================= def get_minibatches(self, batch_size, input_idxs, target_idxs, shuffle=True): """""" minibatches = [] for bkt_idx, bucket in enumerate(self._metabucket): if batch_size == 0: n_splits = 1 else: n_tokens = len(bucket) * bucket.size n_splits = max(n_tokens // batch_size, 1) if shuffle: range_func = np.random.permutation else: range_func = np.arange arr_sp = np.array_split(range_func(len(bucket)), n_splits) for bkt_mb in arr_sp: minibatches.append((bkt_idx, bkt_mb)) if shuffle: np.random.shuffle(minibatches) for bkt_idx, bkt_mb in minibatches: feed_dict = {} data = self[bkt_idx].data[bkt_mb] sents = self[bkt_idx].sents[bkt_mb] maxlen = np.max(np.sum(np.greater(data[:, :, 0], 0), axis=1)) feed_dict.update({ self.inputs: data[:, :maxlen, input_idxs], self.targets: data[:, :maxlen, target_idxs] }) yield feed_dict, sents def get_minibatches_gemb_train(self, batch_size, input_idxs, target_idxs, shuffle=True): """""" minibatches = [] for bkt_idx, bucket in enumerate(self._metabucket): if 
batch_size == 0: n_splits = 1 else: n_tokens = len(bucket) * bucket.size n_splits = max(n_tokens // batch_size, 1) if shuffle: range_func = np.random.permutation else: range_func = np.arange arr_sp = np.array_split(range_func(len(bucket)), n_splits) for bkt_mb in arr_sp: minibatches.append((bkt_idx, bkt_mb)) if shuffle: np.random.shuffle(minibatches) for bkt_idx, bkt_mb in minibatches: feed_dict = {} data = self[bkt_idx].data[bkt_mb] sents = self[bkt_idx].sents[bkt_mb] unk_id = self.vocabs[0]._str2idx['<UNK>'] #no_oov = [i for i,s in enumerate(data[:,:,0]) if unk_id not in s] no_oov = list(range(len(sents))) # check for empty batch if len(no_oov) == 0: continue data_no_oov = data[no_oov] maxlen = np.max(np.sum(np.greater(data_no_oov[:, :, 0], 0), axis=1)) if maxlen <= 2: continue oov_pos = np.random.randint(1, maxlen - 1, size=len(no_oov)) # one oov each sent feed_dict.update({ self.inputs: data_no_oov[:, :maxlen, input_idxs], self.targets: data_no_oov[:, :maxlen, target_idxs] }) yield feed_dict, oov_pos, sents[no_oov] def get_minibatches_gemb_test(self, batch_size, input_idxs, target_idxs, shuffle=True): """""" batch_size = 1 # must be 1 during testing minibatches = [] for bkt_idx, bucket in enumerate(self._metabucket): if batch_size == 0: n_splits = 1 else: n_tokens = len(bucket) * bucket.size n_splits = max(n_tokens // batch_size, 1) if shuffle: range_func = np.random.permutation else: range_func = np.arange arr_sp = np.array_split(range_func(len(bucket)), n_splits) for bkt_mb in arr_sp: minibatches.append((bkt_idx, bkt_mb)) if shuffle: np.random.shuffle(minibatches) for bkt_idx, bkt_mb in minibatches: feed_dict = {} data = self[bkt_idx].data[bkt_mb] sents = self[bkt_idx].sents[bkt_mb] maxlen = np.max(np.sum(np.greater(data[:, :, 0], 0), axis=1)) unk_id = self.vocabs[0]._str2idx['<UNK>'] oov_pos = [np.where(s == unk_id) for s in sents] oov_pos = np.squeeze( oov_pos + \ np.reshape(oov_pos.shape[1] * \ np.arange(oov_pos.shape[0]), [-1,1]) ) feed_dict.update({ self.inputs: data[:, :maxlen, input_idxs], self.targets: data[:, :maxlen, target_idxs] }) yield feed_dict, oov_pos, sents #============================================================= @property def n_bkts(self): if self._train: return super(Dataset, self).n_bkts else: return super(Dataset, self).n_valid_bkts #============================================================= def __getitem__(self, key): return self._metabucket[key] def __len__(self): return len(self._metabucket)
class Dataset(Configurable):
  """"""

  #=============================================================
  def __init__(self, filename, vocabs, builder, *args, **kwargs):
    """"""
    super(Dataset, self).__init__(*args, **kwargs)
    self._file_iterator = self.file_iterator(filename)
    self._train = (filename == self.train_file)
    self._metabucket = Metabucket(self._config, n_bkts=self.n_bkts)
    self._data = None
    self.vocabs = vocabs
    self.rebucket()

    self.inputs = tf.placeholder(dtype=tf.int32, shape=(None, None, None), name='inputs')
    self.targets = tf.placeholder(dtype=tf.int32, shape=(None, None), name='targets')
    self.sntmod = tf.placeholder(dtype=tf.float32, shape=(None, 3), name='sntmod')
    self.builder = builder()

  #=============================================================
  def file_iterator(self, filename):
    """"""
    with open(filename) as f:
      if self.lines_per_buffer > 0:
        buff = [[]]
        while True:
          line = f.readline()
          while line:
            line = line.strip().split()
            if line:
              buff[-1].append(line)
            else:
              if len(buff) < self.lines_per_buffer:
                if buff[-1]:
                  buff.append([])
              else:
                break
            line = f.readline()
          if not line:
            f.seek(0)
          else:
            buff = self._process_buff(buff)
            yield buff
            line = line.strip().split()
            if line:
              buff = [[line]]
            else:
              buff = [[]]
      else:
        buff = [[]]
        for line in f:
          line = line.strip().split()
          if line:
            buff[-1].append(line)
          else:
            if buff[-1]:
              buff.append([])
        if buff[-1] == []:
          buff.pop()
        buff = self._process_buff(buff)
        while True:
          yield buff

  #=============================================================
  def _process_buff(self, buff):
    """"""
    words, tags = self.vocabs
    for i, sent in enumerate(buff):
      targetflag = 0
      for j, token in enumerate(sent):
        if token[2] != 'o':
          targetflag = 1
        word, tag = token[0], token[1]
        is_other = (token[2] == 'o')
        # 1 for tokens inside the target span, 0 otherwise.
        istarget = 0 if is_other else 1
        # 1 for non-target tokens before / after the first target token, respectively.
        bftarget = 1 if is_other and targetflag == 0 else 0
        aftarget = 1 if is_other and targetflag == 1 else 0
        sentmod = self.getmood(token[2])
        buff[i][j] = (word,) + words[word] + tags[tag] + (int(istarget),) + (int(bftarget),) + (int(aftarget),) + (sentmod,)
    return buff

  #=============================================================
  def getmood(self, polarity):
    """Map a token-level polarity tag ('o' or '<prefix>-positive/negative/...')
    to an integer code: 0 for 'o', 2 for positive, 4 for negative, 6 otherwise."""
    if polarity == 'o':
      return 0
    polarity = polarity.split('-')[1]
    if polarity == 'positive':
      return 2
    elif polarity == 'negative':
      return 4
    else:
      return 6

  #=============================================================
  def reset(self, sizes):
    """"""
    self._data = []
    self._targets = []
    self._metabucket.reset(sizes)
    return

  #=============================================================
  def rebucket(self):
    """"""
    buff = next(self._file_iterator)
    len_cntr = Counter()
    for sent in buff:
      len_cntr[len(sent)] += 1
    self.reset(KMeans(self.n_bkts, len_cntr).splits)
    for sent in buff:
      self._metabucket.add(sent)
    self._finalize()
    return

  #=============================================================
  def _finalize(self):
    """"""
    self._metabucket._finalize()
    return

  #=============================================================
  def get_minibatches(self, batch_size, input_idxs, target_idxs, shuffle=True):
    """"""
    minibatches = []
    for bkt_idx, bucket in enumerate(self._metabucket):
      if batch_size == 0:
        n_splits = 1
      #elif not self.minimize_pads:
      #  n_splits = max(len(bucket) // batch_size, 1)
      #  if bucket.size > 100:
      #    n_splits *= 2
      else:
        n_tokens = len(bucket) * bucket.size
        n_splits = max(n_tokens // batch_size, 1)
      if shuffle:
        range_func = np.random.permutation
      else:
        range_func = np.arange
      arr_sp = np.array_split(range_func(len(bucket)), n_splits)
      for bkt_mb in arr_sp:
        if len(bkt_mb) > 0:
          minibatches.append((bkt_idx, bkt_mb))
    if shuffle:
      np.random.shuffle(minibatches)
    for bkt_idx, bkt_mb in minibatches:
      data = self[bkt_idx].data[bkt_mb]
      sents = self[bkt_idx].sents[bkt_mb]
      sntmodp = self[bkt_idx].smod[bkt_mb]
      maxlen = np.max(np.sum(np.greater(data[:, :, 0], 0), axis=1))
      # print("[tlog] maxlen\n" + str(maxlen))
      feed_dict = {
        self.inputs: data[:, :maxlen, input_idxs],
        self.targets: data[:, :maxlen, target_idxs],
        self.sntmod: sntmodp
      }
      yield feed_dict, sents

  #=============================================================
  @property
  def n_bkts(self):
    if self._train:
      return super(Dataset, self).n_bkts
    else:
      return super(Dataset, self).n_valid_bkts

  #=============================================================
  def __getitem__(self, key):
    return self._metabucket[key]

  def __len__(self):
    return len(self._metabucket)
class Dataset(Configurable): """""" #============================================================= def __init__(self, filename, vocabs, builder, *args, **kwargs): """""" super(Dataset, self).__init__(*args, **kwargs) self._file_iterator = self.file_iterator(filename) self._train = (filename == self.train_file) self._metabucket = Metabucket(self._config, n_bkts=self.n_bkts) self._data = None self.vocabs = vocabs self.rebucket() self.inputs = tf.placeholder(dtype=tf.int32, shape=(None,None,None), name='inputs') self.targets = tf.placeholder(dtype=tf.int32, shape=(None,None,None), name='targets') self.builder = builder() #============================================================= def file_iterator(self, filename): """""" with open(filename) as f: if self.lines_per_buffer > 0: buff = [[]] while True: line = f.readline() while line: line = line.strip().split() if line and line[0] != '#': buff[-1].append(line) else: if len(buff) < self.lines_per_buffer: if buff[-1]: buff.append([]) else: break line = f.readline() if not line: f.seek(0) else: buff = self._process_buff(buff) yield buff line = line.strip().split() if line and line[0] != '#': buff = [[line]] else: buff = [[]] else: buff = [[]] for line in f: line = line.strip().split() if line and line[0] != '#': buff[-1].append(line) else: if buff[-1]: buff.append([]) if buff[-1] == []: buff.pop() buff = self._process_buff(buff) while True: yield buff #============================================================= def _process_buff(self, buff): """""" words, tags, rels = self.vocabs for i, sent in enumerate(buff): new_sent = [None]*(len(sent)+1) for j, token in enumerate(sent): try: word, tag1, tag2, head, rel = token[words.conll_idx], token[tags.conll_idx[0]], token[tags.conll_idx[1]], token[6], token[rels.conll_idx] if head.isdigit(): new_sent[int(token[0])] = (word,) + words[word] + tags[tag1] + tags[tag2] + (int(head),) + rels[rel] except Exception as e: print(token) raise(e) new_sent[0] = ('root', Vocab.ROOT, Vocab.ROOT, Vocab.ROOT, Vocab.ROOT, 0, Vocab.ROOT) buff[i] = [x for x in new_sent if x is not None] return buff #============================================================= def reset(self, sizes): """""" self._data = [] self._targets = [] self._metabucket.reset(sizes) return #============================================================= def rebucket(self): """""" buff = self._file_iterator.next() len_cntr = Counter() for sent in buff: len_cntr[len(sent)] += 1 self.reset(KMeans(self.n_bkts, len_cntr).splits) for sent in buff: self._metabucket.add(sent) self._finalize() return #============================================================= def _finalize(self): """""" self._metabucket._finalize() return #============================================================= def get_minibatches(self, batch_size, input_idxs, target_idxs, shuffle=True): """""" minibatches = [] for bkt_idx, bucket in enumerate(self._metabucket): if batch_size == 0: n_splits = 1 else: n_tokens = len(bucket) * bucket.size n_splits = max(n_tokens // batch_size, 1) if shuffle: range_func = np.random.permutation else: range_func = np.arange arr_sp = np.array_split(range_func(len(bucket)), n_splits) for bkt_mb in arr_sp: minibatches.append( (bkt_idx, bkt_mb) ) if shuffle: np.random.shuffle(minibatches) for bkt_idx, bkt_mb in minibatches: feed_dict = {} data = self[bkt_idx].data[bkt_mb] sents = self[bkt_idx].sents[bkt_mb] maxlen = np.max(np.sum(np.greater(data[:,:,0], 0), axis=1)) feed_dict.update({ self.inputs: data[:,:maxlen,input_idxs], self.targets: 
data[:,:maxlen,target_idxs] }) yield feed_dict, sents #============================================================= @property def n_bkts(self): if self._train: return super(Dataset, self).n_bkts else: return super(Dataset, self).n_valid_bkts #============================================================= def __getitem__(self, key): return self._metabucket[key] def __len__(self): return len(self._metabucket)
class Dataset(Configurable): """""" #============================================================= def __init__(self, filename, vocabs, builder, *args, **kwargs): """""" super(Dataset, self).__init__(*args, **kwargs) self.vocabs = vocabs self.train_domains_set = set(self.train_domains.split(',')) if self.train_domains != '-' and self.name == "Trainset" else set() print("Loading training data from domains:", self.train_domains_set if self.train_domains_set else "all") self._file_iterator = self.file_iterator(filename) self._train = (filename == self.train_file) self._metabucket = Metabucket(self._config, n_bkts=self.n_bkts) self._data = None self.rebucket() self.inputs = tf.placeholder(dtype=tf.int32, shape=(None,None,None), name='inputs') self.targets = tf.placeholder(dtype=tf.int32, shape=(None,None,None), name='targets') self.builder = builder() #============================================================= def file_iterator(self, filename): """""" with open(filename) as f: if self.lines_per_buffer > 0: buff = [[]] while True: line = f.readline() while line: line = line.strip().split() if line and (not self.train_domains_set or line[0].split('/')[0] in self.train_domains): buff[-1].append(line) else: if len(buff) < self.lines_per_buffer: if len(buff[-1]) > 0: buff.append([]) else: buff[-1] = [] else: break line = f.readline() if not line: f.seek(0) else: buff = self._process_buff(buff) yield buff line = line.strip().split() if line: buff = [[line]] else: buff = [[]] else: buff = [[]] for line in f: line = line.strip().split() if line and (not self.train_domains_set or line[0].split('/')[0] in self.train_domains): buff[-1].append(line) else: if len(buff[-1]) > 0: buff.append([]) else: buff[-1] = [] if buff[-1] == []: buff.pop() buff = self._process_buff(buff) while True: yield buff #============================================================= def _process_buff(self, buff): """""" words, tags, rels, srls, trigs, domains = self.vocabs srl_start_field = srls.conll_idx[0] sents = 0 toks = 0 for i, sent in enumerate(buff): # if not self.conll2012 or (self.conll2012 and len(list(sent)) > 1): # print(sent, len(sent)) sents += 1 sent_len = len(sent) num_fields = len(sent[0]) srl_take_indices = [idx for idx in range(srl_start_field, srl_start_field + sent_len) if idx < num_fields - 1 and (self.train_on_nested or np.all(['/' not in sent[j][idx] for j in range(sent_len)]))] for j, token in enumerate(sent): toks += 1 if self.conll: word, tag1, tag2, head, rel = token[words.conll_idx], token[tags.conll_idx[0]], token[tags.conll_idx[1]], token[6], token[rels.conll_idx] if rel == 'root': head = j else: head = int(head) - 1 buff[i][j] = (word,) + words[word] + tags[tag1] + tags[tag2] + (head,) + rels[rel] elif self.conll2012: word, auto_tag, gold_tag, head, rel = token[words.conll_idx], token[tags.conll_idx[0]], token[tags.conll_idx[1]], token[6], token[rels.conll_idx] domain = token[0].split('/')[0] # print(word, tag1, tag2, head, rel) if rel == 'root': head = j else: head = int(head) - 1 # srl_fields = [token[idx] if idx < len(token)-1 else 'O' for idx in range(srl_start_field, srl_start_field + sent_len)] srl_fields = [token[idx] for idx in srl_take_indices] # todo can we use fancy indexing here? 
srl_fields += ['O'] * (sent_len - len(srl_take_indices)) #np.any([s in self.trigger_indices for s in srl_tags]) srl_tags = [srls[s][0] for s in srl_fields] if self.joint_pos_predicates: is_trigger = token[trigs.conll_idx[0]] != '-' and (self.train_on_nested or self.trigger_str in srl_fields) trigger_str = str(is_trigger) + '/' + gold_tag else: is_trigger = token[trigs.conll_idx] != '-' and (self.train_on_nested or self.trigger_str in srl_fields) trigger_str = str(is_trigger) buff[i][j] = (word,) + words[word] + tags[auto_tag] + trigs[trigger_str] + domains[domain] + tags[gold_tag] + (head,) + rels[rel] + tuple(srl_tags) # sent.insert(0, ('root', Vocab.ROOT, Vocab.ROOT, Vocab.ROOT, Vocab.ROOT, 0, Vocab.ROOT)) print("Loaded %d sentences with %d tokens (%s)" % (sents, toks, self.name)) return buff #============================================================= def reset(self, sizes): """""" self._data = [] self._targets = [] self._metabucket.reset(sizes) return #============================================================= def rebucket(self): """""" buff = self._file_iterator.next() len_cntr = Counter() for sent in buff: len_cntr[len(sent)] += 1 self.reset(KMeans(self.n_bkts, len_cntr).splits) for sent in buff: self._metabucket.add(sent) self._finalize() return #============================================================= def _finalize(self): """""" self._metabucket._finalize() return #============================================================= def get_minibatches(self, batch_size, input_idxs, target_idxs, shuffle=True): """""" minibatches = [] for bkt_idx, bucket in enumerate(self._metabucket): if batch_size == 0: n_splits = 1 else: n_tokens = len(bucket) * bucket.size n_splits = max(n_tokens // batch_size, 1) if shuffle: range_func = np.random.permutation else: range_func = np.arange arr_sp = np.array_split(range_func(len(bucket)), n_splits) for bkt_mb in arr_sp: minibatches.append( (bkt_idx, bkt_mb) ) if shuffle: np.random.shuffle(minibatches) for bkt_idx, bkt_mb in minibatches: feed_dict = {} data = self[bkt_idx].data[bkt_mb] sents = self[bkt_idx].sents[bkt_mb] maxlen = np.max(np.sum(np.greater(data[:,:,0], 0), axis=1)) np.set_printoptions(threshold=np.nan) # print("maxlen", maxlen) # print("maxlen+max(target_idxs)", maxlen+max(target_idxs)) # print("data.shape[2]", data.shape[2]) # targets = data[:,:maxlen,min(target_idxs):maxlen+max(target_idxs)+1] # print("data shape", targets.shape) # print("data[:,:,3:] shape", targets[:,:,3:].shape) feed_dict.update({ self.inputs: data[:,:maxlen,input_idxs], self.targets: data[:,:maxlen,min(target_idxs):maxlen+max(target_idxs)+1] }) yield feed_dict, sents #============================================================= @property def n_bkts(self): if self._train: return super(Dataset, self).n_bkts else: return super(Dataset, self).n_valid_bkts #============================================================= def __getitem__(self, key): return self._metabucket[key] def __len__(self): return len(self._metabucket)
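# --- Illustrative sketch (not project code) ---------------------------------
# In the SRL-style variants above, get_minibatches slices the targets as
# data[:, :maxlen, min(target_idxs):maxlen + max(target_idxs) + 1]: the fixed
# target columns are followed by one SRL-tag column per token, so the number of
# target columns grows with sentence length. The numpy example below (made-up
# column layout) simply demonstrates the resulting shape.
import numpy as np

target_idxs = [3, 4, 5]      # e.g. gold-tag, head and relation columns
maxlen = 6                   # longest sentence in the minibatch
n_fixed, n_srl = 6, maxlen   # fixed columns, plus one SRL column per token
data = np.zeros((2, maxlen, n_fixed + n_srl), dtype=np.int32)

targets = data[:, :maxlen, min(target_idxs):maxlen + max(target_idxs) + 1]
print(targets.shape)  # (2, 6, 9): 3 fixed target columns + 6 SRL columns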