def load_w2v(file, type='txt', header=True, check_zero=True):
    """
    Load word embedding original file
        * file [str]: load file/path
        * type [str]: use 'bin'/'txt' to load '.bin'/'.txt' file
                      (name shadows the builtin, kept for caller compatibility)
        * header [bool]: the first line of a 'txt' file is "<num_word> <vector_size>"
        * check_zero [bool]: check whether the first line is a zero vector (need type='txt')
        - w2v [word_vector]: word_vector class for type='txt';
                             a gensim KeyedVectors object for type='bin'
        ! ValueError: 'type' is neither 'txt' nor 'bin'
    """
    if type == 'bin':
        dot.start("* Load word embedding")
        from gensim.models.keyedvectors import KeyedVectors
        w2v = KeyedVectors.load_word2vec_format(file, binary=True)
        dot.stop()
        # Without this the final print raised UnboundLocalError for 'bin' files.
        vector_size = w2v.vector_size
    elif type == 'txt':
        with cs.open(file) as fobj:
            line = fobj.readline().rstrip()
            if header:
                # Header line gives the word count and the embedding dimension.
                num_word, vector_size = map(int, line.split())
                line = fobj.readline().rstrip()
            else:
                # No header: infer the dimension from the first record
                # (one word followed by its vector components).
                vector_size = len(line.split(' ')) - 1

            if check_zero:
                # add_zero is True when the first vector has a non-zero component.
                first_vec = list(map(float, line.split(' ')[1:]))
                add_zero = any(first_vec)
            else:
                add_zero = False
            w2v = word_vector(vector_size, add_zero)

            if header:
                # Read exactly as many records as the header announced.
                for _ in bar(num_word, "* Load word embedding"):
                    fields = line.rstrip().split(' ')
                    w2v[fields[0]] = np.array(fields[1:], dtype=float)
                    line = fobj.readline()
            else:
                # Read until readline() returns '' at end of file.
                dot.start("* Load word embedding")
                while line:
                    fields = line.rstrip().split(' ')
                    w2v[fields[0]] = np.array(fields[1:], dtype=float)
                    line = fobj.readline()
                dot.stop()
    else:
        raise ValueError("Value error of 'type', want 'txt'/'bin', get '{}'.".format(type))
    print("- Word embedding size:", vector_size)
    return w2v
def save_dict(_dict, file, desc=None, line_split=False, code='utf-8'):
    """
    Write a dict to a file as JSON
        * _dict [dict]: dict for saving
        * file [str]: save file/path
        * desc [str]: a description string, shown through 'dot' while saving
        * line_split [bool]: write one "<key>\\t<json value>" record per line
                             instead of a single JSON document
        * code [str]: encoding
        - [int]: always 1
    """
    if desc:
        dot.start("* Save {}".format(desc))

    with cs.open(file, 'w', code) as writer:
        if not line_split:
            # Whole dict as one JSON document.
            json.dump(_dict, writer)
        else:
            # One record per line: raw key text, a tab, then the JSON value.
            for name, payload in _dict.items():
                record = '\t'.join((str(name), json.dumps(payload)))
                writer.write(record + '\n')

    if desc:
        dot.stop()
    return 1
def save_list(_list, file, desc=None, line_split=False, code='utf-8'):
    """
    Write a list to a file as JSON
        * _list [list]: list for saving
        * file [str]: save file/path
        * desc [str]: a description string, shown through 'dot' while saving
        * line_split [bool]: write each element as its own JSON line
                             instead of a single JSON document
        * code [str]: encoding
        - [int]: always 1
    """
    if desc:
        dot.start("* Save {}".format(desc))

    with cs.open(file, 'w', code) as writer:
        if not line_split:
            # Whole list as one JSON document.
            json.dump(_list, writer)
        else:
            # One JSON-encoded element per line.
            for item in _list:
                encoded = json.dumps(item)
                writer.write(encoded + '\n')

    if desc:
        dot.stop()
    return 1
def load_list(file, desc=None, line_split=False, code='utf-8'):
    """
    Load a file to list
        * file [str]: load file/path
        * desc [str]: a description string, shown through 'dot' while loading
        * line_split [bool]: each line contains one JSON-encoded element;
                             blank lines are skipped
        * code [str]: encoding
        - _list [list]: result list
    """
    if desc:
        dot.start("* Load {}".format(desc))
    with cs.open(file, 'r', code) as inobj:
        if line_split:
            # Skip blank/whitespace-only lines (e.g. a trailing empty line),
            # which json.loads would otherwise reject.
            _list = [json.loads(line) for line in inobj if line.strip()]
        else:
            _list = json.load(inobj)
    if desc:
        dot.stop()
    return _list
def load_dict(file, desc=None, line_split=False, code='utf-8'):
    """
    Load a file to dict
        * file [str]: load file/path
        * desc [str]: a description string, shown through 'dot' while loading
        * line_split [bool]: each line contains one "<key>\\t<json value>" record;
                             blank lines are skipped, keys are returned as str
        * code [str]: encoding
        - _dict [dict]: result dict
    """
    if desc:
        dot.start("* Load {}".format(desc))
    _dict = {}
    with cs.open(file, 'r', code) as inobj:
        if line_split:
            for line in inobj:
                # Skip blank/whitespace-only lines (e.g. a trailing empty line),
                # which would otherwise fail to unpack into key and value.
                if not line.strip():
                    continue
                # Split on the first tab only, so the JSON value may contain tabs.
                key, value = line.split('\t', 1)
                _dict[key] = json.loads(value)
        else:
            _dict = json.load(inobj)
    if desc:
        dot.stop()
    return _dict