from collections import defaultdict
from typing import Any, Dict, List

import numpy as np
import torch

# Vocabulary is assumed to be the AllenNLP-style class exposing
# get_token_index / get_vocab_size with namespaces plus the _padding_token
# and _oov_token attributes; adjust the import to this project's actual module.
from allennlp.data import Vocabulary


def get_minibatch(batch: List[Dict], vocab: Vocabulary,
                  use_cuda: bool) -> Dict[str, Any]:
    """Pad a batch of numberized instances and convert them to tensors."""
    # Sort by token length (descending) so outputs['seq_lens'] can feed
    # pack_padded_sequence directly downstream.
    batch = sorted(batch, key=lambda x: len(x['tokens']), reverse=True)
    batch_seq_len = [len(instance['tokens']) for instance in batch]
    max_seq_len = max(batch_seq_len)
    max_char_seq_len = max(
        len(tok) for instance in batch for tok in instance['token_chars'])

    outputs = defaultdict(list)
    token_padding_idx = vocab.get_token_index(vocab._padding_token, 'tokens')
    char_padding_idx = vocab.get_token_index(vocab._padding_token, 'token_chars')
    # Labels are padded with -1 so padded positions can be masked out of the loss.
    label_padding_idx = -1

    for instance in batch:
        cur_seq_len = len(instance['tokens'])
        outputs['tokens'].append(
            instance['tokens'] + [token_padding_idx] * (max_seq_len - cur_seq_len))
        outputs['ent_labels'].append(
            instance['ent_labels'] + [label_padding_idx] * (max_seq_len - cur_seq_len))
        outputs['ent_span_labels'].append(
            instance['ent_span_labels'] + [label_padding_idx] * (max_seq_len - cur_seq_len))
        outputs['candi_rels'].append(instance['candi_rels'])
        outputs['ent_ids'].append(instance['ent_ids'])
        outputs['ent_ids_labels'].append(instance['ent_ids_labels'])
        outputs['rel_labels'].append(instance['rel_labels'])

        # Pad each token's character sequence to max_char_seq_len, then pad the
        # sentence with all-padding character rows up to max_seq_len.
        char_pad = [
            char_seq + [char_padding_idx] * (max_char_seq_len - len(char_seq))
            for char_seq in instance['token_chars']
        ]
        char_pad += [[char_padding_idx] * max_char_seq_len] * (max_seq_len - cur_seq_len)
        outputs['token_chars'].append(char_pad)

    outputs['tokens'] = torch.LongTensor(outputs['tokens'])
    outputs['token_chars'] = torch.LongTensor(outputs['token_chars'])
    outputs['ent_labels'] = torch.LongTensor(outputs['ent_labels'])
    outputs['ent_span_labels'] = torch.LongTensor(outputs['ent_span_labels'])
    outputs['seq_lens'] = batch_seq_len

    if use_cuda:
        outputs['tokens'] = outputs['tokens'].cuda(non_blocking=True)
        outputs['token_chars'] = outputs['token_chars'].cuda(non_blocking=True)
        outputs['ent_labels'] = outputs['ent_labels'].cuda(non_blocking=True)
        outputs['ent_span_labels'] = outputs['ent_span_labels'].cuda(non_blocking=True)

    return outputs

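# Consumption sketch (illustrative, not part of the original module): because
# get_minibatch sorts the batch by length descending, outputs['seq_lens'] can
# feed torch.nn.utils.rnn.pack_padded_sequence directly. `embedding` and
# `lstm` are assumed to be caller-built modules.
def encode_tokens(outputs: Dict[str, Any], embedding: torch.nn.Embedding,
                  lstm: torch.nn.LSTM) -> torch.Tensor:
    embedded = embedding(outputs['tokens'])  # (batch, max_seq_len, emb_dim)
    packed = torch.nn.utils.rnn.pack_padded_sequence(
        embedded, outputs['seq_lens'], batch_first=True)
    hidden, _ = lstm(packed)
    # Unpack back to a padded (batch, max_seq_len, hidden_dim) tensor.
    unpacked, _ = torch.nn.utils.rnn.pad_packed_sequence(hidden, batch_first=True)
    return unpacked
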
def seqchar2number(instance: Dict, vocab: Vocabulary,
                   lower_case: bool) -> List[List[int]]:
    """Map each token to the list of its character indices."""
    nums = []
    for token in instance['tokens']:
        nums.append([
            vocab.get_token_index(char.lower() if lower_case else char,
                                  'token_chars')
            for char in token
        ])
    return nums

def load_word_vectors(vector_file: str, ndims: int, vocab: Vocabulary,
                      namespace: str = 'tokens') -> np.ndarray:
    """Build an embedding matrix, filling rows found in a pretrained vector file.

    Rows absent from the file keep their uniform(-0.25, 0.25) initialization;
    the padding row is zeroed.
    """
    token_vocab_size = vocab.get_vocab_size(namespace)
    oov_idx = vocab.get_token_index(vocab._oov_token, namespace)
    padding_idx = vocab.get_token_index(vocab._padding_token, namespace)

    W = np.random.uniform(-0.25, 0.25, (token_vocab_size, ndims))
    W[padding_idx, :] = 0.0

    total, found = 0, 0
    with open(vector_file, encoding='utf-8') as fp:
        for i, line in enumerate(fp):
            line = line.rstrip().split()
            if not line:
                continue
            total += 1
            # Skip malformed lines whose dimensionality disagrees with ndims.
            if len(line) != ndims + 1:
                print("Line[{}] {} vector dims {} doesn't match ndims={}".format(
                    i, line[0], len(line) - 1, ndims))
                continue
            word = line[0]
            idx = vocab.get_token_index(word, namespace)
            if idx != oov_idx:
                found += 1
                W[idx, :] = np.array([float(v) for v in line[1:]])

    print("Found {} vectors [{:.2f}% of vocab] among {} lines in {} (ndims={})".format(
        found, found * 100 / token_vocab_size, total, vector_file, ndims))
    # norm_W = np.sqrt((W * W).sum(axis=1, keepdims=True))
    # valid_idx = norm_W.squeeze() != 0
    # W[valid_idx, :] /= norm_W[valid_idx]
    return W

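# Usage sketch (illustrative; the file name is a placeholder): wrap the
# pretrained matrix in a trainable torch.nn.Embedding. The padding row stays
# zero-initialized by load_word_vectors.
def build_pretrained_embedding(vector_file: str, ndims: int,
                               vocab: Vocabulary) -> torch.nn.Embedding:
    W = load_word_vectors(vector_file, ndims, vocab)
    return torch.nn.Embedding.from_pretrained(
        torch.from_numpy(W).float(), freeze=False,
        padding_idx=vocab.get_token_index(vocab._padding_token, 'tokens'))
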
def data2number(corpus: List[Dict], vocab: Vocabulary) -> List[Dict]:
    """Convert raw (string) instances into index-based instances."""
    instances = []
    oov_idx = vocab.get_token_index(vocab._oov_token, 'tokens')
    # Check characters against the OOV index of their own namespace rather
    # than reusing the 'tokens' one.
    char_oov_idx = vocab.get_token_index(vocab._oov_token, 'token_chars')
    for e in corpus:
        instance = {}
        instance['tokens'] = seq2number(e, vocab, 'tokens', True)
        instance['token_chars'] = seqchar2number(e, vocab, False)
        instance['ent_labels'] = seq2number(e, vocab, 'ent_labels', False)
        instance['rel_labels'] = seq2number(e, vocab, 'rel_labels', False)
        instance['candi_rels'] = e['candi_rels']
        # Every token and character is expected to be in the vocabulary.
        assert all(oov_idx != n for n in instance['tokens'])
        assert all(char_oov_idx != m
                   for n in instance['token_chars'] for m in n)
        instances.append(instance)
    return instances

def seq2number(instance: Dict, vocab: Vocabulary, namespace: str,
               lower_case: bool) -> List[int]:
    """Map each item in instance[namespace] to its vocabulary index."""
    return [
        vocab.get_token_index(item.lower() if lower_case else item, namespace)
        for item in instance[namespace]
    ]

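
# Pipeline sketch (illustrative): numberize a corpus once, then yield padded,
# tensorized minibatches. Note that get_minibatch also expects
# 'ent_span_labels', 'ent_ids', and 'ent_ids_labels' on each instance, which
# data2number does not populate here; they are assumed to be attached
# elsewhere in the pipeline before batching.
def iterate_minibatches(corpus: List[Dict], vocab: Vocabulary,
                        batch_size: int, use_cuda: bool):
    instances = data2number(corpus, vocab)
    for i in range(0, len(instances), batch_size):
        yield get_minibatch(instances[i:i + batch_size], vocab, use_cuda)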