Beispiel #1
0
def seq2idx(src, tar, vec_json, src_encoding="utf-8", tar_encoding="utf-8"):
    """Translate a json-lines token-sequence file into a json-lines index-sequence file.

    Reads `src` line by line, maps each json-encoded token list through the
    vocabulary loaded from `vec_json`, and writes one json-encoded index list
    per line to `tar`.
    """
    vocabulary = WVDict.from_file(vec_json)
    progress_desc = "converting %s -> %s" % (src, tar)
    with rf_open(src, encoding=src_encoding) as fin, \
            wf_open(tar, encoding=tar_encoding) as fout:
        for record in tqdm(fin, desc=progress_desc):
            tokens = json.loads(record)
            fout.write(json.dumps(vocabulary.token2idx(tokens)) + "\n")
Beispiel #2
0
def json2csv(src, tar, delimiter=' '):
    """Convert a json-lines vector file into a delimited csv file.

    Each input line is a json-encoded ``[token, vector]`` pair; the output
    row is the token followed by the vector components rendered as strings.
    Returns the path of the written file (`tar`).
    """
    with rf_open(src) as fin, wf_open(tar) as fout:
        csv_writer = csv.writer(fout, delimiter=delimiter)
        for record in fin:
            token, vector = json.loads(record)
            row = [token]
            row.extend(str(component) for component in vector)
            csv_writer.writerow(row)
    return tar
Beispiel #3
0
def csv2json(src, tar, delimiter=' ', skip_first_line=False):
    """Convert a delimited csv vector file into a json-lines file.

    Each csv row is a token followed by its vector components; the output
    line is a json-encoded ``[token, vector]`` pair with float components.
    Optionally skips a header line. Returns the path of the written file.
    """
    with rf_open(src) as fin, wf_open(tar) as fout:
        if skip_first_line:  # pragma: no cover
            fin.readline()
        reader = csv.reader(fin, delimiter=delimiter)
        for row in tqdm(reader, "csv2json: %s --> %s" % (src, tar)):
            token = row[0]
            vector = [float(value) for value in row[1:]]
            fout.write(json.dumps([token, vector]) + "\n")
    return tar
Beispiel #4
0
def load_vec_json(vec_json) -> tuple:
    """Load a json-lines word-vector file into lookup structures.

    Each line of `vec_json` is a json-encoded ``[word, vector]`` pair; the
    word's index is its (0-based) line number.

    Returns:
        tuple: ``(token2idx, idx2vec, idx2token)`` — a dict mapping each word
        to its index, the list of vectors ordered by index, and the list of
        words ordered by index.

    Raises:
        AssertionError: on a duplicate word, an empty first vector, or a
            vector whose dimension disagrees with the first one.
    """
    _token2idx = {}
    _idx2token = []
    _idx2vec = []
    _dim = None
    with rf_open(vec_json) as f:
        for i, line in tqdm(enumerate(f), "loading %s" % vec_json):
            # NOTE: removed a leftover debug `print(line)` that echoed every
            # line of the vector file to stdout while loading.
            _word, _vec = json.loads(line)
            assert _word not in _token2idx, "duplicate: %s" % _word
            if _dim is None:
                # First vector fixes the dimension all later vectors must match.
                _dim = len(_vec)
                assert _dim > 0, "empty vec %s" % _word
            else:
                assert len(
                    _vec) == _dim, "dimension inconsistent %s vs %s: %s" % (
                        _dim, _word, len(_vec))
            _token2idx[_word] = i
            _idx2vec.append(_vec)
            _idx2token.append(_word)

    return _token2idx, _idx2vec, _idx2token
Beispiel #5
0
    def _load_embedding_txt(pretrained_file_path,
                            elem_delim,
                            unknown_token,
                            init_unknown_vec,
                            encoding='utf8'):
        """Load embedding vectors from a pre-trained token embedding file.

        Returns idx_to_token, idx_to_vec and unknown_token suitable for the
        TokenEmbedding constructor.

        For every unknown token, if its representation `unknown_token` is encountered in the
        pre-trained token embedding file, index 0 of `idx_to_vec` maps to the pre-trained token
        embedding vector loaded from the file; otherwise, index 0 of `idx_to_vec` maps to the
        text embedding vector initialized by `init_unknown_vec`.

        If a token is encountered multiple times in the pre-trained text embedding file, only the
        first-encountered token embedding vector will be loaded and the rest will be skipped.

        Parameters
        ----------
        pretrained_file_path : str
            Path to the delimited text embedding file; each line is a token
            followed by its vector components.
        elem_delim : str
            Field delimiter within each line.
        unknown_token : str or None
            Representation of the unknown token; when truthy, index 0 of the
            result is reserved for it.
        init_unknown_vec : callable
            Initializer called as ``init_unknown_vec(shape=vec_len)`` when the
            unknown token's vector is not present in the file.
        encoding : str, default 'utf8'
            Text encoding used to read the file.
        """
        # Slot 0 of idx_to_token/idx_to_vec is reserved for the unknown token
        # when one is configured.
        idx_to_token = [unknown_token] if unknown_token else []
        unk_idx = None
        if unknown_token:
            unk_idx = 0

        vec_len = None
        # Flat list of vector components; reshaped into (num_tokens, vec_len)
        # after the whole file has been read.
        all_elems = []
        tokens = set()  # tokens already loaded, for duplicate detection
        loaded_unknown_vec = None
        with rf_open(pretrained_file_path, encoding=encoding) as f:
            for line_num, elems in enumerate(
                    csv.reader(f, delimiter=elem_delim)):

                assert len(
                    elems
                ) > 1, 'line {} in {}: unexpected data format.'.format(
                    line_num, pretrained_file_path)

                token, elems = elems[0], [float(i) for i in elems[1:]]

                if loaded_unknown_vec is None and token == unknown_token:
                    # The file supplies its own unknown-token vector; stash it
                    # and write it into slot 0 after the reshape below.
                    loaded_unknown_vec = elems
                    tokens.add(unknown_token)
                elif token in tokens:
                    warnings.warn(
                        'line {} in {}: duplicate embedding found for '
                        'token "{}". Skipped.'.format(line_num,
                                                      pretrained_file_path,
                                                      token))
                elif len(elems) == 1 and line_num == 0:
                    # A first line with exactly one (numeric) value after the
                    # token looks like a "<vocab_size> <dim>" style header --
                    # skip it rather than loading it as an embedding.
                    warnings.warn(
                        'line {} in {}: skipped likely header line.'.format(
                            line_num, pretrained_file_path))
                else:
                    if not vec_len:
                        # First real embedding fixes the vector dimension.
                        vec_len = len(elems)
                        if unknown_token:
                            # Reserve a vector slot for the unknown token at the very beginning
                            # because the unknown token index is 0.
                            assert len(all_elems) == 0
                            all_elems.extend([0] * vec_len)
                    else:
                        assert len(elems) == vec_len, \
                            'line {} in {}: found vector of inconsistent dimension for token ' \
                            '"{}". expected dim: {}, found: {}'.format(line_num,
                                                                       pretrained_file_path,
                                                                       token, vec_len, len(elems))
                    all_elems.extend(elems)
                    idx_to_token.append(token)
                    tokens.add(token)

        # Materialize the flat component list as an NDArray of shape
        # (num_tokens [+1 for the unknown slot], vec_len).
        idx_to_vec = nd.array(all_elems).reshape((-1, vec_len))

        if unknown_token:
            # Fill the reserved slot 0: prefer the vector found in the file,
            # otherwise fall back to the provided initializer.
            if loaded_unknown_vec is None:
                idx_to_vec[unk_idx] = init_unknown_vec(shape=vec_len)
            else:
                idx_to_vec[unk_idx] = nd.array(loaded_unknown_vec)

        return idx_to_token, idx_to_vec, unknown_token
Beispiel #6
0
def test_token2idx(token_seq, vec_json):
    """Check that seq2idx converts the sample token file into the expected indices."""
    idx_file = token_seq + ".idx"
    seq2idx(token_seq, idx_file, vec_json)

    with rf_open(idx_file) as result:
        first_line = json.loads(result.readline())
        second_line = json.loads(result.readline())
    assert first_line == [0, 1]
    assert second_line == [2]