Example 1
from collections import defaultdict
from typing import Any, Dict, List

import torch

# Note: `Vocabulary` is the project's (AllenNLP-style) vocabulary class; its
# import path is not shown in these snippets.


def get_minibatch(batch: List[Dict], vocab: Vocabulary,
                  use_cuda: bool) -> Dict[str, Any]:
    # Sort by token-sequence length, longest first, so the padded batch can be
    # packed (e.g. with pack_padded_sequence) without re-sorting.
    batch = sorted(batch, key=lambda x: len(x['tokens']), reverse=True)
    batch_seq_len = [len(instance['tokens']) for instance in batch]
    max_seq_len = max(batch_seq_len)
    # Longest character sequence of any token in the batch.
    max_char_seq_len = max(
        len(tok) for instance in batch for tok in instance['token_chars'])

    outputs = defaultdict(list)
    # Padding ids come from the vocabulary; label positions are padded with -1
    # so they can be masked out downstream.
    token_padding_idx = vocab.get_token_index(vocab._padding_token, 'tokens')
    char_padding_idx = vocab.get_token_index(vocab._padding_token,
                                             'token_chars')
    label_padding_idx = -1
    for instance in batch:
        cur_seq_len = len(instance['tokens'])

        # Right-pad token ids and per-token label sequences to max_seq_len.
        outputs['tokens'].append(instance['tokens'] + [token_padding_idx] *
                                 (max_seq_len - cur_seq_len))
        outputs['ent_labels'].append(instance['ent_labels'] +
                                     [label_padding_idx] *
                                     (max_seq_len - cur_seq_len))
        outputs['ent_span_labels'].append(instance['ent_span_labels'] +
                                          [label_padding_idx] *
                                          (max_seq_len - cur_seq_len))
        outputs['candi_rels'].append(instance['candi_rels'])
        outputs['ent_ids'].append(instance['ent_ids'])
        outputs['ent_ids_labels'].append(instance['ent_ids_labels'])
        outputs['rel_labels'].append(instance['rel_labels'])
        # Pad each token's character ids to max_char_seq_len, then pad the
        # token dimension with rows of padding characters.
        char_pad = []
        for char_seq in instance['token_chars']:
            char_pad.append(char_seq + [char_padding_idx] *
                            (max_char_seq_len - len(char_seq)))
        char_pad += ([[char_padding_idx] * max_char_seq_len] *
                     (max_seq_len - cur_seq_len))
        outputs['token_chars'].append(char_pad)
    # Stack the padded, rectangular fields into LongTensors.
    outputs['tokens'] = torch.LongTensor(outputs['tokens'])
    outputs['token_chars'] = torch.LongTensor(outputs['token_chars'])
    outputs['ent_labels'] = torch.LongTensor(outputs['ent_labels'])
    outputs['ent_span_labels'] = torch.LongTensor(outputs['ent_span_labels'])
    outputs['seq_lens'] = batch_seq_len
    if use_cuda:
        outputs['tokens'] = outputs['tokens'].cuda(non_blocking=True)
        outputs['token_chars'] = outputs['token_chars'].cuda(non_blocking=True)
        outputs['ent_labels'] = outputs['ent_labels'].cuda(non_blocking=True)
        outputs['ent_span_labels'] = outputs['ent_span_labels'].cuda(
            non_blocking=True)
    return outputs
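The descending-length sort at the top of get_minibatch is what makes the padded batch directly usable with PyTorch's packing utilities. The sketch below is not from the repository: it assumes an embedding layer and a minibatch dict returned by get_minibatch already exist, and only illustrates that downstream step.

# Hypothetical downstream step (not part of the repository): embed the padded
# tokens and pack them for an RNN. `embedding` and `minibatch` are assumed to
# exist; get_minibatch already guarantees descending seq_lens.
import torch
from torch.nn.utils.rnn import pack_padded_sequence

embedded = embedding(minibatch['tokens'])  # (batch, max_seq_len, emb_dim)
packed = pack_padded_sequence(embedded,
                              torch.tensor(minibatch['seq_lens']),
                              batch_first=True)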
Example 2
def seqchar2number(instance: Dict, vocab: Vocabulary,
                   lower_case: bool) -> List[List[int]]:
    # Map each token to the list of its character ids in the 'token_chars'
    # namespace.
    nums = []
    for token in instance['tokens']:
        nums.append([
            vocab.get_token_index(item.lower() if lower_case else item,
                                  'token_chars') for item in token
        ])
    return nums
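An illustrative call, assuming vocab is the same Vocabulary used in the other examples and already has a populated 'token_chars' namespace; the instance dict here is invented.

# Illustrative only: the instance dict is made up and `vocab` is assumed to
# exist with a populated 'token_chars' namespace.
char_ids = seqchar2number({'tokens': ['John', 'lives', 'here']},
                          vocab, lower_case=False)
# char_ids[0] has one id per character of 'John', i.e. a list of length 4.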
Example 3
import numpy as np


def load_word_vectors(vector_file: str,
                      ndims: int,
                      vocab: Vocabulary,
                      namespace: str = 'tokens') -> np.ndarray:
    token_vocab_size = vocab.get_vocab_size(namespace)
    oov_idx = vocab.get_token_index(vocab._oov_token, namespace)
    padding_idx = vocab.get_token_index(vocab._padding_token, namespace)
    # Every row starts with small random values; rows for words found in the
    # vector file are overwritten below, and the padding row is zeroed.
    W = np.random.uniform(-0.25, 0.25, (token_vocab_size, ndims))
    W[padding_idx, :] = 0.0
    total, found = 0, 0
    with open(vector_file) as fp:
        for i, line in enumerate(fp):
            line = line.rstrip().split()
            if line:
                total += 1
                # Skip malformed lines instead of failing the whole load.
                if len(line) != ndims + 1:
                    print("Line[{}] {} vector dims {} doesn't match ndims={}"
                          .format(i, line[0], len(line) - 1, ndims))
                    continue
                word = line[0]
                idx = vocab.get_token_index(word, namespace)
                if idx != oov_idx:
                    # Only words present in the vocabulary overwrite their
                    # random initialization.
                    found += 1
                    vecs = np.array(list(map(float, line[1:])))
                    W[idx, :] = vecs
    print("Found {} [{:.2f}%] vectors from {} vectors in {} with ndims={}".
          format(found, found * 100 / token_vocab_size, total, vector_file,
                 ndims))
    #  norm_W = np.sqrt((W*W).sum(axis=1, keepdims=True))
    #  valid_idx = norm_W.squeeze() != 0
    #  W[valid_idx, :] /= norm_W[valid_idx]
    return W
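The returned matrix is typically used to initialize an embedding layer. A minimal sketch, assuming a vocab built over the 'tokens' namespace; the 300-dimensional GloVe file name is only an example and not part of the repository.

# Hypothetical initialization of a PyTorch embedding from the loaded matrix.
# The file name and `vocab` are assumptions, not taken from the repository.
import torch
import torch.nn as nn

W = load_word_vectors('glove.6B.300d.txt', 300, vocab, namespace='tokens')
embedding = nn.Embedding.from_pretrained(torch.FloatTensor(W), freeze=False)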
Example 4
def data2number(corpus: List[Dict], vocab: Vocabulary) -> List[Dict]:
    # Numericalize each raw instance with seq2number (Example 5) and
    # seqchar2number (Example 2).
    instances = []
    oov_idx = vocab.get_token_index(vocab._oov_token, 'tokens')
    for e in corpus:
        instance = {}
        instance['tokens'] = seq2number(e, vocab, 'tokens', True)
        instance['token_chars'] = seqchar2number(e, vocab, False)
        instance['ent_labels'] = seq2number(e, vocab, 'ent_labels', False)
        instance['rel_labels'] = seq2number(e, vocab, 'rel_labels', False)
        instance['candi_rels'] = e['candi_rels']

        # Sanity check: the vocabulary is expected to cover this corpus, so no
        # token or character should map to the OOV id.
        assert all(oov_idx != n for n in instance['tokens'])
        assert all(oov_idx != m for n in instance['token_chars'] for m in n)

        instances.append(instance)
    return instances
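Taken together, the helpers in these examples form a small preprocessing pipeline. Note that get_minibatch (Example 1) also reads ent_span_labels, ent_ids and ent_ids_labels fields that data2number as shown here does not fill in; they are presumably attached elsewhere in the repository. A sketch of the wiring under that caveat, with corpus and vocab assumed to come from the repository's own reading and vocabulary-building code:

# Hypothetical wiring: `corpus` and `vocab` are assumed to exist, and each
# numericalized instance is assumed to also carry the ent_span_labels,
# ent_ids and ent_ids_labels fields that get_minibatch expects.
train_instances = data2number(corpus, vocab)
first_batch = get_minibatch(train_instances[:16], vocab, use_cuda=False)
print(first_batch['tokens'].shape)       # (16, max_seq_len)
print(first_batch['token_chars'].shape)  # (16, max_seq_len, max_char_seq_len)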
Example 5
def seq2number(instance: Dict, vocab: Vocabulary, namespace: str,
               lower_case: bool) -> List[int]:
    # Map every item of instance[namespace] to its id in that namespace.
    return [
        vocab.get_token_index(item.lower() if lower_case else item, namespace)
        for item in instance[namespace]
    ]