def main():
    """Run ``sparsesvd`` on an explicit PMI matrix and save the results.

    Command-line arguments are parsed with docopt:

    * ``<pmi_path>``    -- path handed to ``PositiveExplicit``
    * ``<output_path>`` -- prefix for every file written
    * ``--dim``         -- second argument to ``sparsesvd`` (converted to int)
    * ``--neg``         -- forwarded to ``PositiveExplicit`` as ``neg``
      (converted to int)

    Files written: ``<output_path>.ut.npy``, ``.s.npy``, ``.vt.npy`` (the three
    values returned by ``sparsesvd``) plus ``.words.vocab`` and
    ``.contexts.vocab`` (from ``explicit.iw`` / ``explicit.ic``).  The elapsed
    time of the SVD call is printed to stdout.
    """
    # docopt derives the interface from the *layout* of this text: the usage
    # pattern and the option list must be separate sections, and each option
    # must be separated from its description by at least two spaces.
    args = docopt("""
    Usage:
        pmi2svd.py [options] <pmi_path> <output_path>

    Options:
        --dim NUM    Dimensionality of eigenvectors [default: 500]
        --neg NUM    Number of negative samples; subtracts its log from PMI [default: 1]
    """)

    pmi_path = args['<pmi_path>']
    output_path = args['<output_path>']
    dim = int(args['--dim'])
    neg = int(args['--neg'])

    explicit = PositiveExplicit(pmi_path, normalize=False, neg=neg)

    # Only the decomposition itself is timed, not loading or saving.
    start = time.time()
    ut, s, vt = sparsesvd(explicit.m.tocsc(), dim)
    print("Time elapsed for SVD: %f" % (time.time() - start))

    np.save(output_path + '.ut.npy', ut)
    np.save(output_path + '.s.npy', s)
    np.save(output_path + '.vt.npy', vt)
    save_vocabulary(output_path + '.words.vocab', explicit.iw)
    save_vocabulary(output_path + '.contexts.vocab', explicit.ic)
def _counts2PMI(self):
    """Build a PMI matrix from the count-pair file and persist it.

    Reads ``self.count_pair_file`` (one ``count word context`` triple per
    line), accumulates the counts into a sparse word-by-context matrix indexed
    by the sorted keys of ``self.words`` and ``self.contexts``, and passes it
    to ``self.calc_pmi`` together with ``self.cds``.

    Side effects:
        * the PMI matrix is saved to ``self.pmi_file``;
        * the sorted vocabularies are saved to ``self.pmi_file`` +
          ``'.words.vocab'`` / ``'.contexts.vocab'``;
        * ``self.explicit`` is set to a ``PositiveExplicit`` loaded from
          ``self.pmi_file``;
        * ``self.explicit`` is handed to ``cf.saveDictionary`` with a target
          path derived from the first two ``'/'``-separated parts of
          ``self.dict_name``.
    """
    word_list = sorted(self.words.keys())
    context_list = sorted(self.contexts.keys())
    word_index = {word: pos for pos, word in enumerate(word_list)}
    context_index = {context: pos for pos, context in enumerate(context_list)}
    shape = (len(word_index), len(context_index))

    # Entries are staged in a DOK matrix and folded into the CSR accumulator
    # every `flush_every` lines.
    accumulated = csr_matrix(shape, dtype=np.float32)
    staged = dok_matrix(shape, dtype=np.float32)
    flush_every = 100000
    lines_since_flush = 0

    with open(self.count_pair_file) as pair_file:
        for raw_line in pair_file:
            count, word, context = raw_line.strip().split()
            if word in word_index and context in context_index:
                staged[word_index[word], context_index[context]] = int(count)
            # NOTE(review): the counter is advanced once per input line; the
            # original nesting of this statement should be confirmed.
            lines_since_flush += 1
            if lines_since_flush == flush_every:
                accumulated = accumulated + staged.tocsr()
                staged = dok_matrix(shape, dtype=np.float32)
                lines_since_flush = 0

    # Fold in whatever is left in the final, partially filled batch.
    accumulated = accumulated + staged.tocsr()

    pmi = self.calc_pmi(accumulated, self.cds)

    save_matrix(self.pmi_file, pmi)
    save_vocabulary(self.pmi_file + '.words.vocab', word_list)
    save_vocabulary(self.pmi_file + '.contexts.vocab', context_list)

    self.explicit = PositiveExplicit(self.pmi_file, normalize=False, neg=self.neg)

    name_parts = self.dict_name.split('/')
    dictionary_path = name_parts[0] + '/' + name_parts[1] + '_explicit_ppmi.bin'
    cf.saveDictionary(self.explicit, dictionary_path)
def create_representation(args):
    """Instantiate the representation selected by parsed command-line arguments.

    Args:
        args: Mapping with the keys ``'<representation>'``,
            ``'<representation_path>'``, ``'--neg'`` (int-convertible),
            ``'--w+c'`` (truthy/falsy flag) and ``'--eig'``
            (float-convertible).

    Returns:
        * ``'PPMI'``  -> ``PositiveExplicit(path, True, neg)``
        * ``'SVD'``   -> ``SVDEmbedding(path, True, eig)``, or with ``--w+c``
          an ``EnsembleEmbedding`` of two ``SVDEmbedding`` objects
        * ``'GLOVE'`` -> ``GLOVEEmbedding(path, True)`` (``--w+c`` is not
          consulted)
        * anything else -> ``Embedding(path + '.words', True)``, or with
          ``--w+c`` an ``EnsembleEmbedding`` of the ``.words`` and
          ``.contexts`` embeddings

    Raises:
        NotImplementedError: if ``--w+c`` is set together with ``'PPMI'``.
            This is a subclass of ``Exception``, so callers that catch
            ``Exception`` keep working.
    """
    rep_type = args['<representation>']
    path = args['<representation_path>']
    neg = int(args['--neg'])
    w_c = args['--w+c']
    eig = float(args['--eig'])

    if rep_type == 'PPMI':
        if w_c:
            raise NotImplementedError('w+c is not implemented for PPMI.')
        return PositiveExplicit(path, True, neg)

    if rep_type == 'SVD':
        if w_c:
            # The two SVD embeddings differ only in their last positional flag.
            return EnsembleEmbedding(
                SVDEmbedding(path, False, eig, False),
                SVDEmbedding(path, False, eig, True),
                True,
            )
        return SVDEmbedding(path, True, eig)

    if rep_type == 'GLOVE':
        return GLOVEEmbedding(path, True)

    # Fallback for every other representation name.
    if w_c:
        return EnsembleEmbedding(
            Embedding(path + '.words', False),
            Embedding(path + '.contexts', False),
            True,
        )
    return Embedding(path + '.words', True)
def main():
    """Run ``sparsesvd`` on a selectable explicit matrix and save the results.

    Command-line arguments are parsed with docopt:

    * ``<repres>``      -- selects the explicit class: ``BPMI`` ->
      ``BinExplicit``, ``PMI`` -> ``NoExplicit``, ``NPMI`` -> ``NegExplicit``,
      anything else -> ``PositiveExplicit``
    * ``<pmi_path>``    -- path handed to the selected class
    * ``<output_path>`` -- prefix for every file written
    * ``--dim``         -- second argument to ``sparsesvd`` (converted to int)
    * ``--neg``         -- forwarded only to ``PositiveExplicit``
    * ``--k``           -- forwarded only to ``NoExplicit``

    Files written: ``<output_path>.ut.npy``, ``.s.npy``, ``.vt.npy`` plus
    ``.words.vocab`` and ``.contexts.vocab``.
    """
    # docopt derives the interface from the *layout* of this text: the usage
    # pattern and the option list must be separate sections, and an option is
    # separated from its description (including a bare ``[default: ...]``) by
    # at least two spaces.
    args = docopt("""
    Usage:
        pmi2svd.py [options] <repres> <pmi_path> <output_path>

    Options:
        --dim NUM    Dimensionality of eigenvectors [default: 500]
        --neg NUM    Number of negative samples; subtracts its log from PMI [default: 1]
        --k NUM      [default: 1]
    """)

    repres = args['<repres>']
    pmi_path = args['<pmi_path>']
    output_path = args['<output_path>']
    dim = int(args['--dim'])
    neg = int(args['--neg'])
    k = int(args['--k'])

    if repres == "BPMI":
        explicit = BinExplicit(pmi_path, normalize=False)
    elif repres == "PMI":
        explicit = NoExplicit(pmi_path, normalize=False, k=k)
    elif repres == "NPMI":
        explicit = NegExplicit(pmi_path, normalize=False)
    else:
        # Any unrecognized name falls back to the positive variant.
        explicit = PositiveExplicit(pmi_path, normalize=False, neg=neg)

    ut, s, vt = sparsesvd(explicit.m.tocsc(), dim)

    np.save(output_path + '.ut.npy', ut)
    np.save(output_path + '.s.npy', s)
    np.save(output_path + '.vt.npy', vt)
    save_vocabulary(output_path + '.words.vocab', explicit.iw)
    save_vocabulary(output_path + '.contexts.vocab', explicit.ic)
def main():
    """Run ``sparsesvd`` on a PPMI matrix and save the three returned arrays.

    Command-line arguments are parsed with docopt:

    * ``<ppmi>``   -- path handed to ``PositiveExplicit``
    * ``<output>`` -- prefix for every file written
    * ``--dim``    -- second argument to ``sparsesvd`` (converted to int)
    * ``--neg``    -- forwarded to ``PositiveExplicit`` as ``neg``
      (converted to int)

    Files written: ``<output>.ut.npy``, ``<output>.s.npy`` and
    ``<output>.vt.npy``.  No vocabulary files are written by this entry point.
    """
    # docopt derives the interface from the *layout* of this text: the usage
    # pattern and the option list must be separate sections, and each option
    # must be separated from its description by at least two spaces.
    args = docopt("""
    Usage:
        ppmi2svd.py [options] <ppmi> <output>

    Options:
        --dim NUM    Dimensionality of eigenvectors [default: 300]
        --neg NUM    Number of negative samples; subtracts its log from PMI [default: 1]
    """)

    ppmi_path = args['<ppmi>']
    output_path = args['<output>']
    dim = int(args['--dim'])
    neg = int(args['--neg'])

    explicit = PositiveExplicit(ppmi_path, normalize=False, neg=neg)

    ut, s, vt = sparsesvd(explicit.m.tocsc(), dim)

    np.save(output_path + '.ut.npy', ut)
    np.save(output_path + '.s.npy', s)
    np.save(output_path + '.vt.npy', vt)
def folder2chi(folder):
    """Return the ``similarity_first_order`` attribute for a folder's "chi" matrix.

    A ``PositiveExplicit`` is constructed from ``join(folder, "chi")`` with its
    default arguments.  The attribute is returned as-is; it is not called
    here.
    """
    chi_path = join(folder, "chi")
    representation = PositiveExplicit(chi_path)
    return representation.similarity_first_order