def read_matrix_and_idmap_from_file(fn, dic=None, ecd='utf-8'):
    '''
    Load a text embedding file and extend a word -> index map with every
    word of the file that the map does not contain yet.

    The first line of the file is parsed as "<row> <column>", where
    <column> is the embedding dimension. Each following line is parsed as
    "<word> <v1> <v2> ..." with single-space separators.

    Args:
        fn: path of the embedding file.
        dic: optional word -> index mapping to start from. It is copied and
            never modified. Defaults to an empty mapping.
        ecd: text encoding of the file; undecodable bytes are ignored.

    Returns:
        (M, idx_map) where M is the array produced by
        ArrayInit(...).initialize(len(idx_map) + 2, dim) with the rows of
        the words found in the file overwritten by their unit-normalised
        vectors, and idx_map is the extended copy of dic.
        NOTE(review): the 2 extra rows are presumably reserved for special
        tokens (an older note mentioned OOV, BOS and EOS) - confirm.

    Raises:
        ValueError: if the header or a vector component cannot be parsed,
            or if a non-empty vector does not have <column> components.
    '''
    base = {} if dic is None else dic
    idx_map = base.copy()
    multiplier = 1  # scale applied to every unit-normalised vector
    with codecs.open(fn, encoding=ecd, errors='ignore') as inf:
        # split() tolerates repeated blanks in the header, like
        # read_matrix_from_file does.
        row, column = inf.readline().rstrip().split()
        dim = int(column)
        content = []
        not_in_dict = 0
        for line in inf:
            stripped = line.rstrip()
            if not stripped:
                # A blank (e.g. trailing) line carries no word and no vector;
                # registering '' as a word would only crash further down.
                continue
            # Word lines keep split(' '): a token may itself be a non-space
            # whitespace character.
            elems = stripped.split(' ')
            content.append(elems)
            if elems[0] not in idx_map:
                # New words are appended after the existing indices.
                idx_map[elems[0]] = len(idx_map)
                not_in_dict += 1
    print('load embedding! %s words, %d not in the dictionary. Dictionary size: %d' % (
        row, not_in_dict, len(base)))
    # Internal invariant: every counted word added exactly one entry.
    assert len(idx_map) == len(base) + not_in_dict
    M = ArrayInit(ArrayInit.onesided_uniform,
                  multiplier=1.0 / dim).initialize(len(idx_map) + 2, dim)
    for elems in content:
        # Every word in content was inserted into idx_map above, so the
        # lookup cannot fail.
        idx = idx_map[elems[0]]
        r = np.array([float(_e) for _e in elems[1:]])
        norm = np.linalg.norm(r)
        if norm > 0:
            M[idx, :dim] = (r / norm) * multiplier
        # else: an all-zero or missing vector cannot be normalised; the row
        # keeps its random initialisation instead of becoming NaN.
    print('embedding matrix shape: %s word map size: %d' % (M.shape, len(idx_map)))
    return M, idx_map
def read_matrix_from_pkl(fn, dic):
    '''
    Build an embedding matrix for the words of dic from a pickled
    (voc, data) pair.

    The pickle is expected to unpack into two parallel sequences: voc[i] is
    a word and data[i] is its embedding vector. dic maps words to row
    indices of the returned matrix.

    Args:
        fn: path of the pickle file.
        dic: word -> row index mapping.

    Returns:
        M, the array produced by ArrayInit(...).initialize(len(dic), dim),
        where the rows of the words present in both voc and dic are
        overwritten by their rescaled unit-normalised vectors.

    Raises:
        IndexError: if data is empty (the dimension is read from data[0])
            or voc is shorter than data.
    '''
    # SECURITY NOTE: unpickling can execute arbitrary code; only load files
    # from a trusted source.
    # The file is opened in binary mode, as pickle requires, and closed
    # deterministically by the with-block.
    with open(fn, 'rb') as inf:
        voc, data = pickle.load(inf)
    dim = len(data[0])
    # NOTE: The norm of onesided_uniform rv is sqrt(n)/sqrt(3)
    # Since the expected value of X^2 = 1/3 where X ~ U[0, 1]
    # => sum(X_i^2) = dim/3
    # => norm = sqrt(dim/3)
    # => norm/dim = sqrt(1/3dim)
    # i.e. loaded vectors are scaled to the expected norm of a randomly
    # initialised row (assuming onesided_uniform draws from U[0, 1] before
    # the 1/dim multiplier is applied - TODO confirm against ArrayInit).
    multiplier = np.sqrt(1.0 / (3 * dim))
    M = ArrayInit(ArrayInit.onesided_uniform,
                  multiplier=1.0 / dim).initialize(len(dic), dim)
    not_in_dict = 0
    for i, vec in enumerate(data):
        word = voc[i]
        if word not in dic:
            not_in_dict += 1
            continue
        # Convert only the vectors that are actually stored.
        r = np.array([float(_e) for _e in vec])
        norm = np.linalg.norm(r)
        if norm > 0:
            M[dic[word]] = (r / norm) * multiplier
        # else: an all-zero vector cannot be normalised; the row keeps its
        # random initialisation instead of becoming NaN.
    print('load embedding! %d words, %d not in the dictionary. Dictionary size: %d' % (
        len(voc), not_in_dict, len(dic)))
    return M
def read_matrix_from_file(fn, dic, ecd='utf-8'):
    '''
    Build an embedding matrix for the words of dic from a text embedding
    file.

    The first line of the file is parsed as "<row> <column>", where
    <column> is the embedding dimension. Each following line is parsed as
    "<word> <v1> <v2> ..." with single-space separators.

    Args:
        fn: path of the embedding file.
        dic: word -> row index mapping; it is only read.
        ecd: text encoding of the file; undecodable bytes are ignored.

    Returns:
        (M, idx_map) where M is the array produced by
        ArrayInit(...).initialize(len(dic) + 2, dim) with the rows of the
        dictionary words found in the file overwritten by their rescaled
        unit-normalised vectors, and idx_map maps each overwritten row
        index to the zero-based number (header excluded) of the file line
        that supplied it.
        NOTE(review): the 2 extra rows are presumably reserved for special
        tokens - confirm against the callers.

    Raises:
        ValueError: if the header or a vector component cannot be parsed,
            or if a non-empty vector does not have <column> components.
    '''
    not_in_dict = 0
    idx_map = dict()
    with codecs.open(fn, encoding=ecd, errors='ignore') as inf:
        row, column = inf.readline().rstrip().split()
        dim = int(column)
        multiplier = np.sqrt(1.0 / 3)
        M = ArrayInit(ArrayInit.onesided_uniform,
                      multiplier=1.0 / dim).initialize(len(dic) + 2, dim)
        # line_count numbers every line after the header, known word or not.
        for line_count, line in enumerate(inf):
            # Word lines keep split(' '): a token may itself be a non-space
            # whitespace character.
            elems = line.rstrip().split(' ')
            word = elems[0]
            if word not in dic:
                not_in_dict += 1
                continue
            idx = dic[word]
            r = np.array([float(_e) for _e in elems[1:]])
            norm = np.linalg.norm(r)
            if norm > 0:
                M[idx] = (r / norm) * multiplier
                idx_map[idx] = line_count
            # else: an all-zero or missing vector cannot be normalised; the
            # row keeps its random initialisation (instead of becoming NaN)
            # and is not reported as loaded in idx_map.
    print('load embedding! %s words, %d not in the dictionary. Dictionary size: %d' % (
        row, not_in_dict, len(dic)))
    print('embedding matrix shape: %s word map size: %d' % (M.shape, len(idx_map)))
    return M, idx_map