Beispiel #1
0
def read_matrix_and_idmap_from_file(fn, dic=None, ecd='utf-8'):
    '''
    Load a text embedding file and return the embedding matrix together with
    a word -> index map that extends ``dic``.

    The first line of the file is a header of the form "<row> <column>";
    ``column`` is used as the embedding dimension. Every following non-blank
    line holds a word followed by its space-separated vector components.

    Args:
        fn: path of the embedding file.
        dic: optional existing word -> index mapping. It is copied, never
            modified; words of the file that are missing from it receive the
            next free index. Defaults to an empty mapping.
        ecd: text encoding of the file (undecodable bytes are ignored).

    Returns:
        A pair ``(M, idx_map)``. ``M`` comes from
        ``ArrayInit(...).initialize(len(idx_map) + 2, dim)`` and has the row
        of every word found in the file overwritten with its L2-normalised
        vector. ``idx_map`` is the extended copy of ``dic``.
    '''
    # A None default avoids sharing one dict object between calls.
    if dic is None:
        dic = {}
    idx_map = dic.copy()
    multiplier = 1  # scale applied to every normalised vector
    content = []
    not_in_dict = 0
    with codecs.open(fn, encoding=ecd, errors='ignore') as inf:
        row, column = inf.readline().rstrip().split(' ')
        dim = int(column)
        # First pass: collect the vocabulary, because the matrix size depends
        # on how many new words the file contributes.
        for line in inf:
            stripped = line.rstrip()
            if not stripped:
                # A blank (e.g. trailing) line carries no word or vector;
                # keeping it would register '' as a word and then fail on the
                # row assignment below.
                continue
            # Split on a plain space only, so that words containing other
            # whitespace characters stay intact.
            elems = stripped.split(' ')
            content.append(elems)
            if elems[0] not in idx_map:
                idx_map[elems[0]] = len(idx_map)
                not_in_dict += 1
    print('load embedding! %s words, %d not in the dictionary. Dictionary size: %d' % (
        row, not_in_dict, len(dic)))
    assert len(idx_map) == len(dic) + not_in_dict
    # NOTE(review): an older comment spoke of "+3" for OOV, BOS and EOS, but
    # the code reserves 2 extra rows - confirm which one is intended.
    M = ArrayInit(ArrayInit.onesided_uniform,
                  multiplier=1.0 / dim).initialize(len(idx_map) + 2, dim)
    # Second pass: every collected word is guaranteed to be in idx_map.
    for elems in content:
        idx = idx_map[elems[0]]
        r = np.array([float(_e) for _e in elems[1:]])
        norm = np.linalg.norm(r)
        if norm > 0:
            # An all-zero vector is stored as zeros instead of becoming NaN.
            r = r / norm
        M[idx, :dim] = r * multiplier
    print('embedding matrix shape: %s word map size: %d' % (
        M.shape, len(idx_map)))
    return M, idx_map
Beispiel #2
0
def read_matrix_from_pkl(fn, dic):
    '''
    Build an embedding matrix for ``dic`` from a pickled ``(voc, data)`` pair.

    The pickle stored in ``fn`` is unpacked into ``voc`` (the words) and
    ``data`` (the vectors), where ``data[i]`` is the vector of ``voc[i]``.
    ``dic`` maps words to row indices of the returned matrix.

    Args:
        fn: path of the pickle file.
        dic: word -> row index mapping.

    Returns:
        The matrix from ``ArrayInit(...).initialize(len(dic), dim)`` in which
        the row of every word of ``voc`` that occurs in ``dic`` is replaced
        by its L2-normalised vector scaled by ``sqrt(1 / (3 * dim))``.
    '''
    # NOTE(security): unpickling can execute arbitrary code; only load
    # embedding files that come from a trusted source.
    # Pickles are binary data: open in 'rb' and close the handle when done.
    with open(fn, 'rb') as inf:
        voc, data = pickle.load(inf)
    dim = len(data[0])
    # NOTE: The norm of onesided_uniform rv is sqrt(n)/sqrt(3)
    # Since the expected value of X^2 = 1/3 where X ~ U[0, 1]
    # => sum(X_i^2) = dim/3
    # => norm       = sqrt(dim/3)
    # => norm/dim   = sqrt(1/3dim)
    multiplier = np.sqrt(1.0 / (3 * dim))
    M = ArrayInit(ArrayInit.onesided_uniform,
                  multiplier=1.0 / dim).initialize(len(dic), dim)
    not_in_dict = 0
    for i, e in enumerate(data):
        word = voc[i]
        if word not in dic:
            not_in_dict += 1
            continue
        # Convert only the vectors that are actually stored.
        r = np.array([float(_e) for _e in e])
        norm = np.linalg.norm(r)
        if norm > 0:
            # An all-zero vector is stored as zeros instead of becoming NaN.
            r = r / norm
        M[dic[word]] = r * multiplier
    print('load embedding! %d words, %d not in the dictionary. Dictionary size: %d' % (
        len(voc), not_in_dict, len(dic)))
    return M
Beispiel #3
0
def read_matrix_from_file(fn, dic, ecd='utf-8'):
    '''
    Load the vectors of the words in ``dic`` from a text embedding file.

    The first line of the file is a header "<row> <column>"; ``column`` is
    used as the embedding dimension. Every following line holds a word
    followed by its space-separated vector components. Words that are not in
    ``dic`` are only counted.

    Args:
        fn: path of the embedding file.
        dic: word -> row index mapping.
        ecd: text encoding of the file (undecodable bytes are ignored).

    Returns:
        A pair ``(M, idx_map)``. ``M`` comes from
        ``ArrayInit(...).initialize(len(dic) + 2, dim)`` and has the row of
        every word found in the file replaced by its L2-normalised vector
        scaled by ``sqrt(1 / 3)``. ``idx_map`` maps the ``dic`` index of each
        such word to the zero-based position of its line after the header.
    '''
    not_in_dict = 0
    idx_map = dict()
    multiplier = np.sqrt(1.0 / 3)
    with codecs.open(fn, encoding=ecd, errors='ignore') as inf:
        row, column = inf.readline().rstrip().split()
        dim = int(column)
        M = ArrayInit(ArrayInit.onesided_uniform,
                      multiplier=1.0 / dim).initialize(len(dic) + 2, dim)
        # line_count numbers the data lines from 0, header excluded.
        for line_count, line in enumerate(inf):
            # Split on a plain space only, so that words containing other
            # whitespace characters stay intact.
            elems = line.rstrip().split(' ')
            if elems[0] not in dic:
                not_in_dict += 1
                continue
            idx = dic[elems[0]]
            r = np.array([float(_e) for _e in elems[1:]])
            norm = np.linalg.norm(r)
            if norm > 0:
                # An all-zero vector is stored as zeros instead of becoming
                # NaN.
                r = r / norm
            M[idx] = r * multiplier
            idx_map[idx] = line_count
    print('load embedding! %s words, %d not in the dictionary. Dictionary size: %d' % (
        row, not_in_dict, len(dic)))
    print('embedding matrix shape: %s word map size: %d' % (
        M.shape, len(idx_map)))
    return M, idx_map