def seq2idx(src, tar, vec_json, src_encoding="utf-8", tar_encoding="utf-8"):
    """Convert token sequences in json format into idx sequences in json format.

    Each line of ``src`` is a json-encoded token sequence; the corresponding
    line of ``tar`` is the json-encoded index sequence produced by the
    vocabulary loaded from ``vec_json``.
    """
    vec_dict = WVDict.from_file(vec_json)
    with rf_open(src, encoding=src_encoding) as fin, wf_open(
            tar, encoding=tar_encoding) as fout:
        for record in tqdm(fin, desc="converting %s -> %s" % (src, tar)):
            tokens = json.loads(record)
            fout.write(json.dumps(vec_dict.token2idx(tokens)) + "\n")
def json2csv(src, tar, delimiter=' '):
    """Convert a ``[token, vec]`` json-lines file into csv rows.

    Each output row is the token followed by its vector components (as
    strings). Returns the target path ``tar``.
    """
    with rf_open(src) as fin, wf_open(tar) as fout:
        csv_writer = csv.writer(fout, delimiter=delimiter)
        for record in fin:
            token, vec = json.loads(record)
            row = [token]
            row.extend(str(component) for component in vec)
            csv_writer.writerow(row)
    return tar
def csv2json(src, tar, delimiter=' ', skip_first_line=False):
    """Convert a token-vector csv file into ``[token, vec]`` json lines.

    The first column of each row is the token; the remaining columns are
    parsed as floats. Returns the target path ``tar``.
    """
    with rf_open(src) as fin, wf_open(tar) as fout:
        if skip_first_line:  # pragma: no cover
            fin.readline()
        reader = csv.reader(fin, delimiter=delimiter)
        for row in tqdm(reader, "csv2json: %s --> %s" % (src, tar)):
            token, rest = row[0], row[1:]
            vec = [float(component) for component in rest]
            fout.write(json.dumps([token, vec]) + "\n")
    return tar
def load_vec_json(vec_json) -> tuple:
    """Load a ``[token, vec]`` json-lines file into lookup structures.

    Parameters
    ----------
    vec_json: str
        path to a file where each line is ``json.dumps([token, vec])``

    Returns
    -------
    tuple
        ``(_token2idx, _idx2vec, _idx2token)`` where ``_token2idx`` maps a
        token to its line index, ``_idx2vec`` lists vectors by index and
        ``_idx2token`` lists tokens by index.

    Raises
    ------
    AssertionError
        on duplicate tokens, an empty first vector, or vectors of
        inconsistent dimension.
    """
    _token2idx = {}
    _idx2token = []
    _idx2vec = []
    _dim = None
    with rf_open(vec_json) as f:
        for i, line in tqdm(enumerate(f), "loading %s" % vec_json):
            # NOTE: removed a leftover `print(line)` debug statement that
            # echoed every input line and clobbered the progress bar.
            _word, _vec = json.loads(line)
            assert _word not in _token2idx, "duplicate: %s" % _word
            if _dim is None:
                # First vector fixes the expected dimension.
                _dim = len(_vec)
                assert _dim > 0, "empty vec %s" % _word
            else:
                assert len(
                    _vec) == _dim, "dimension inconsistent %s vs %s: %s" % (
                        _dim, _word, len(_vec))
            _token2idx[_word] = i
            _idx2vec.append(_vec)
            _idx2token.append(_word)
    return _token2idx, _idx2vec, _idx2token
def _load_embedding_txt(pretrained_file_path, elem_delim, unknown_token,
                        init_unknown_vec, encoding='utf8'):
    """Load embedding vectors from a pre-trained token embedding file.

    Returns idx_to_token, idx_to_vec and unknown_token suitable for the
    TokenEmbedding constructor.

    For every unknown token, if its representation `unknown_token` is
    encountered in the pre-trained token embedding file, index 0 of
    `idx_to_vec` maps to the pre-trained token embedding vector loaded from
    the file; otherwise, index 0 of `idx_to_vec` maps to the text embedding
    vector initialized by `init_unknown_vec`.

    If a token is encountered multiple times in the pre-trained text
    embedding file, only the first-encountered token embedding vector will
    be loaded and the rest will be skipped.
    """
    idx_to_token = [unknown_token] if unknown_token else []
    unk_idx = None
    if unknown_token:
        unk_idx = 0
    vec_len = None
    all_elems = []
    tokens = set()
    loaded_unknown_vec = None
    with rf_open(pretrained_file_path, encoding=encoding) as f:
        for line_num, elems in enumerate(
                csv.reader(f, delimiter=elem_delim)):
            assert len(
                elems
            ) > 1, 'line {} in {}: unexpected data format.'.format(
                line_num, pretrained_file_path)
            token, elems = elems[0], [float(i) for i in elems[1:]]
            if loaded_unknown_vec is None and token == unknown_token:
                loaded_unknown_vec = elems
                tokens.add(unknown_token)
            elif token in tokens:
                warnings.warn(
                    'line {} in {}: duplicate embedding found for '
                    'token "{}". Skipped.'.format(line_num,
                                                  pretrained_file_path,
                                                  token))
            elif len(elems) == 1 and line_num == 0:
                # A two-column first line (e.g. "vocab_size dim") leaves a
                # single float after splitting off the token: treat as header.
                warnings.warn(
                    'line {} in {}: skipped likely header line.'.format(
                        line_num, pretrained_file_path))
            else:
                if not vec_len:
                    vec_len = len(elems)
                    if unknown_token:
                        # Reserve a vector slot for the unknown token at the
                        # very beginning because the unknown token index is 0.
                        assert len(all_elems) == 0
                        all_elems.extend([0] * vec_len)
                else:
                    # FIX: this assertion message was a string literal broken
                    # by a raw line break (a syntax error); rejoined here.
                    assert len(elems) == vec_len, \
                        'line {} in {}: found vector of inconsistent ' \
                        'dimension for token "{}". expected dim: {}, ' \
                        'found: {}'.format(line_num, pretrained_file_path,
                                           token, vec_len, len(elems))
                all_elems.extend(elems)
                idx_to_token.append(token)
                tokens.add(token)
    idx_to_vec = nd.array(all_elems).reshape((-1, vec_len))
    if unknown_token:
        if loaded_unknown_vec is None:
            idx_to_vec[unk_idx] = init_unknown_vec(shape=vec_len)
        else:
            idx_to_vec[unk_idx] = nd.array(loaded_unknown_vec)
    return idx_to_token, idx_to_vec, unknown_token
def test_token2idx(token_seq, vec_json):
    """Check that seq2idx writes the expected index sequences."""
    idx_file = token_seq + ".idx"
    seq2idx(token_seq, idx_file, vec_json)
    with rf_open(idx_file) as f:
        first, second = f.readline(), f.readline()
    assert json.loads(first) == [0, 1]
    assert json.loads(second) == [2]