Ejemplo n.º 1
0
def main():
    args = docopt("""
    Usage:
        pmi2svd.py [options] <pmi_path> <output_path>
    
    Options:
        --dim NUM    Dimensionality of eigenvectors [default: 500]
        --neg NUM    Number of negative samples; subtracts its log from PMI [default: 1]
    """)

    # Command-line parameters.
    pmi_matrix_path = args['<pmi_path>']
    out_prefix = args['<output_path>']
    svd_dim = int(args['--dim'])
    neg_samples = int(args['--neg'])

    # Shifted positive PMI representation of the sparse matrix.
    ppmi = PositiveExplicit(pmi_matrix_path, normalize=False, neg=neg_samples)

    # Truncated SVD, timed.
    t0 = time.time()
    ut, s, vt = sparsesvd(ppmi.m.tocsc(), svd_dim)
    print("Time elapsed for SVD: %f" % (time.time() - t0))

    # Persist the three SVD factors and both vocabularies next to <output_path>.
    for suffix, factor in (('.ut.npy', ut), ('.s.npy', s), ('.vt.npy', vt)):
        np.save(out_prefix + suffix, factor)
    save_vocabulary(out_prefix + '.words.vocab', ppmi.iw)
    save_vocabulary(out_prefix + '.contexts.vocab', ppmi.ic)
Ejemplo n.º 2
0
def main():
    """Compute a PMI matrix from a raw co-occurrence counts matrix.

    Reads the counts matrix and its word/context vocabularies, computes PMI
    with context-distribution smoothing, and writes the raw count matrix,
    the PMI matrix, and both vocabulary files under <output_path>.
    """
    args = docopt("""
    Usage:
        counts2pmi.py [options] <counts> <output_path> 
    
    Options:
        --cds NUM    Context distribution smoothing [default: 1.0]
    """)

    counts_path = args['<counts>']
    vectors_path = args['<output_path>']
    cds = float(args['--cds'])

    # Load the sparse counts matrix plus its word/context vocabularies.
    counts, iw, ic = read_counts_matrix(counts_path)
    pmi = calc_pmi(counts, cds, alpha=1.0)

    # Persist the raw counts alongside the PMI matrix and both vocabularies.
    save_matrix(vectors_path + '.count_matrix', counts)

    save_matrix(vectors_path, pmi)
    save_vocabulary(vectors_path + '.words.vocab', iw)
    save_vocabulary(vectors_path + '.contexts.vocab', ic)
Ejemplo n.º 3
0
def main():
    args = docopt("""
    Usage:
        pmi2svd.py [options] <repres> <pmi_path> <output_path>
    
    Options:
        --dim NUM    Dimensionality of eigenvectors [default: 500]
        --neg NUM    Number of negative samples; subtracts its log from PMI [default: 1]
        --k NUM [default: 1]
    """)

    # Command-line parameters.
    representation = args['<repres>']
    pmi_path = args['<pmi_path>']
    output_path = args['<output_path>']
    dim = int(args['--dim'])
    neg = int(args['--neg'])
    k = int(args['--k'])

    # Select the matrix representation; anything unrecognized falls back to
    # the shifted positive PMI (PPMI) representation.
    builders = {
        "BPMI": lambda: BinExplicit(pmi_path, normalize=False),
        "PMI": lambda: NoExplicit(pmi_path, normalize=False, k=k),
        "NPMI": lambda: NegExplicit(pmi_path, normalize=False),
    }
    explicit = builders.get(
        representation,
        lambda: PositiveExplicit(pmi_path, normalize=False, neg=neg),
    )()

    ut, s, vt = sparsesvd(explicit.m.tocsc(), dim)

    # Persist SVD factors and vocabularies next to <output_path>.
    for suffix, factor in (('.ut.npy', ut), ('.s.npy', s), ('.vt.npy', vt)):
        np.save(output_path + suffix, factor)
    save_vocabulary(output_path + '.words.vocab', explicit.iw)
    save_vocabulary(output_path + '.contexts.vocab', explicit.ic)
    def _counts2PMI(self):
        """Build a PMI matrix from the (count, word, context) pair file,
        persist it with its vocabularies, and cache the resulting
        PositiveExplicit representation on the instance."""
        iw = sorted(self.words.keys())
        ic = sorted(self.contexts.keys())
        wi = {w: idx for idx, w in enumerate(iw)}
        ci = {c: idx for idx, c in enumerate(ic)}

        shape = (len(wi), len(ci))
        counts = csr_matrix(shape, dtype=np.float32)
        pending = dok_matrix(shape, dtype=np.float32)
        flush_every = 100000  # fold the DOK buffer into the CSR matrix periodically
        seen = 0
        with open(self.count_pair_file) as pairs:
            for record in pairs:
                count, word, context = record.strip().split()
                if word in wi and context in ci:
                    # NOTE: plain assignment (not +=) — a duplicate pair
                    # overwrites any value already buffered for that cell.
                    pending[wi[word], ci[context]] = int(count)
                seen += 1
                if seen == flush_every:
                    counts = counts + pending.tocsr()
                    pending = dok_matrix(shape, dtype=np.float32)
                    seen = 0
        counts = counts + pending.tocsr()
        pmi = self.calc_pmi(counts, self.cds)

        save_matrix(self.pmi_file, pmi)
        save_vocabulary(self.pmi_file + '.words.vocab', iw)
        save_vocabulary(self.pmi_file + '.contexts.vocab', ic)
        self.explicit = PositiveExplicit(self.pmi_file, normalize=False, neg=self.neg)
        parts = self.dict_name.split('/')
        cf.saveDictionary(self.explicit, parts[0] + '/' + parts[1] + '_explicit_ppmi.bin')
Ejemplo n.º 5
0
def main():
    args = docopt("""
    Usage:
        text2numpy.py <path>
    """)

    path = args['<path>']

    # Read the word -> vector mapping and fix a deterministic row order.
    vectors = read_vectors(path)
    vocab = sorted(vectors.keys())

    # Stack the vectors into one float32 matrix; row i holds vocab[i].
    dense = np.zeros(shape=(len(vocab), len(vectors[vocab[0]])),
                     dtype=np.float32)
    for row, word in enumerate(vocab):
        if word in vectors:
            dense[row, :] = vectors[word]

    # Abort (with diagnostics) if any NaN slipped into the matrix.
    if np.isnan(dense).any():
        print("Warning! {0} contains 1 or more `nan` values!".format(path))
        truth = np.isnan(dense)
        rows = np.array([np.any(x) for x in truth], dtype=bool).nonzero()[0]
        print("Target includes {0} rows: {1}".format(rows.shape[0], rows))
        print(dense[0])
        exit(-1)
    np.save(path + '.npy', dense)
    save_vocabulary(path + '.vocab', vocab)
Ejemplo n.º 6
0
def main():
    args = docopt("""
    Usage:
        counts2pmi.py [options] <counts> <output_path>
    
    Options:
        --cds NUM    Context distribution smoothing [default: 1.0]
    """)
    
    counts_path = args['<counts>']
    vectors_path = args['<output_path>']
    cds = float(args['--cds'])
    
    
    # Rewrite the counts file with normalized lines (strip + "\n") into a
    # '-new' sibling before handing both paths to the fast reader below.
    o = open(counts_path + '-new',"w")
    for line in open(counts_path):
        o.write(line.strip()+"\n")
    o.close()
    
    
    counts_path_new = counts_path + '-new'
    
    
    # NOTE(review): 'read_counts_matrxi_fast' looks misspelled, but it must
    # match the definition elsewhere in the project — confirm before renaming.
    counts, iw, ic = read_counts_matrxi_fast(counts_path, counts_path_new)

    pmi = calc_pmi(counts, cds)

    save_matrix(vectors_path, pmi)
    save_vocabulary(vectors_path + '.words.vocab', iw)
    save_vocabulary(vectors_path + '.contexts.vocab', ic)
    
    savePmiNonzeroTerm_fast(counts,vectors_path + '.cooccurrence')
    

    # Mask over the nonzero entries whose pre-log ratio exceeds 1, i.e. the
    # entries whose PMI will be positive after the log below. Must be taken
    # BEFORE pmi.data is replaced by its log.
    remain_index = pmi.data > 1 

    
    pmi.data = np.log(pmi.data)
    savePmiNonzeroTerm_fast(pmi,vectors_path + '.PMI')
    
    # Zero out counts whose PMI is not positive (presumably counts and pmi
    # share the same sparsity pattern so the masks align — TODO confirm
    # against calc_pmi).
    counts.data = counts.data * remain_index
    counts.eliminate_zeros()
    savePmiNonzeroTerm_fast(counts,vectors_path + '.PPMIcooccurrence')

    

    # Clip negative PMI to zero (PPMI) and drop the resulting explicit zeros.
    pmi.data[pmi.data < 0] = 0
    pmi.eliminate_zeros()
    
    savePmiNonzeroTerm_fast(pmi,vectors_path + '.PPMI')
Ejemplo n.º 7
0
def text2numpy_nonewline(path):
    """Convert the text vector file at *path* into a dense float32 matrix plus
    a vocabulary file; return the two output file paths as a list."""
    vectors = read_vectors(path)
    vocab = sorted(vectors.keys())

    # One row per word, in sorted-vocabulary order.
    dim = len(vectors[vocab[0]])
    dense = np.zeros(shape=(len(vocab), dim), dtype=np.float32)
    for row, word in enumerate(vocab):
        if word in vectors:
            dense[row, :] = vectors[word]

    npy_file = path + '.npy'
    vocab_file = path + '.vocab'

    np.save(npy_file, dense)
    save_vocabulary(vocab_file, vocab)

    return [npy_file, vocab_file]
Ejemplo n.º 8
0
def main():
    args = docopt("""
    Usage:
        text2numpy.py <path>
    """)

    path = args['<path>']

    word_vectors = read_vectors(path)
    sorted_words = sorted(word_vectors)

    # Stack the vectors into one float32 matrix, row order = sorted vocabulary.
    dim = len(word_vectors[sorted_words[0]])
    matrix = np.zeros(shape=(len(sorted_words), dim), dtype=np.float32)
    for idx, w in enumerate(sorted_words):
        if w in word_vectors:
            matrix[idx, :] = word_vectors[w]

    np.save(path + '.npy', matrix)
    save_vocabulary(path + '.vocab', sorted_words)
Ejemplo n.º 9
0
def main():
    """Compute ICA embeddings from a raw co-occurrence counts matrix.

    Reads the counts matrix and its vocabularies, runs ICA with the requested
    number of components, and writes the embedding matrix plus both
    vocabulary files under <output_path>.
    """
    args = docopt("""
    Usage:
        counts2ica.py [options] <counts> <output_path>
    
    Options:
        --cps NUM    Number of ICA components to obtain [default: 50]
    """)

    counts_path = args['<counts>']
    vectors_path = args['<output_path>']
    # docopt returns option values as strings; cast to int so calc_ica gets a
    # numeric component count (matches the sibling scripts' int(--dim),
    # float(--cds) casts).
    n_components = int(args['--cps'])

    counts, iw, ic = read_counts_matrix(counts_path)

    embeddings = calc_ica(counts, n_components)

    save_matrix(vectors_path, embeddings)
    save_vocabulary(vectors_path + '.words.vocab', iw)
    save_vocabulary(vectors_path + '.contexts.vocab', ic)
Ejemplo n.º 10
0
def main():
    args = docopt("""
    Usage:
        text2numpy.py <path>
    """)

    path = args['<path>']

    # Read word -> vector mapping and fix a deterministic row order.
    table = read_vectors(path)
    ordered = sorted(table.keys())

    # Dense float32 matrix with one row per word.
    out = np.zeros(shape=(len(ordered), len(table[ordered[0]])),
                   dtype=np.float32)
    for r, token in enumerate(ordered):
        if token in table:
            out[r, :] = table[token]

    np.save(path + '.npy', out)
    save_vocabulary(path + '.vocab', ordered)
Ejemplo n.º 11
0
def main():
    args = docopt("""
    Usage:
        counts2pmi.py [options] <counts> <output_path>
    
    Options:
        --cds NUM    Context distribution smoothing [default: 1.0]
    """)

    # CLI arguments.
    counts_file = args['<counts>']
    out_prefix = args['<output_path>']
    smoothing = float(args['--cds'])

    # Load counts, compute the smoothed PMI matrix, and write everything out.
    counts, iw, ic = read_counts_matrix(counts_file)

    pmi = calc_pmi(counts, smoothing)

    save_matrix(out_prefix, pmi)
    save_vocabulary(out_prefix + '.words.vocab', iw)
    save_vocabulary(out_prefix + '.contexts.vocab', ic)
Ejemplo n.º 12
0
def main():
    args = docopt("""
    Usage:
        counts2pmi.py [options] <counts> <output_path>
    
    Options:
        --cds NUM    Context distribution smoothing [default: 1.0]
    """)

    src = args['<counts>']
    dst = args['<output_path>']
    cds_exponent = float(args['--cds'])

    matrix, word_list, context_list = read_counts_matrix(src)

    pmi_matrix = calc_pmi(matrix, cds_exponent)

    # Write the PMI matrix and the two vocabularies beside <output_path>.
    save_matrix(dst, pmi_matrix)
    save_vocabulary(dst + '.words.vocab', word_list)
    save_vocabulary(dst + '.contexts.vocab', context_list)
Ejemplo n.º 13
0
def main():
    args = docopt("""
    Usage:
        pmi2svd.py [options] <pmi_path> <output_path>
    
    Options:
        --dim NUM    Dimensionality of eigenvectors [default: 500]
        --neg NUM    Number of negative samples; subtracts its log from PMI [default: 1]
    """)

    source = args['<pmi_path>']
    target = args['<output_path>']
    dimensions = int(args['--dim'])
    negative = int(args['--neg'])

    # Shifted positive PMI representation of the sparse matrix.
    representation = PositiveExplicit(source, normalize=False, neg=negative)

    # Truncated SVD of the sparse PPMI matrix.
    u_t, sigma, v_t = sparsesvd(representation.m.tocsc(), dimensions)

    np.save(target + '.ut.npy', u_t)
    np.save(target + '.s.npy', sigma)
    np.save(target + '.vt.npy', v_t)
    save_vocabulary(target + '.words.vocab', representation.iw)
    save_vocabulary(target + '.contexts.vocab', representation.ic)