def _counts2PMI(self):
    """Build a PMI matrix from the pair-count file and persist it to disk.

    Reads whitespace-separated ``count word context`` lines from
    ``self.count_pair_file``, accumulates them into a sparse counts matrix
    restricted to the known word/context vocabularies, converts the counts
    to PMI via ``self.calc_pmi``, then saves the matrix, both vocabularies,
    and a ``PositiveExplicit`` wrapper (stored on ``self.explicit``).
    """
    # Deterministic row/column order: sorted vocabularies.
    iw = sorted(self.words.keys())
    ic = sorted(self.contexts.keys())
    wi = {w: i for i, w in enumerate(iw)}
    ci = {c: i for i, c in enumerate(ic)}

    counts = csr_matrix((len(wi), len(ci)), dtype=np.float32)
    tmp_counts = dok_matrix((len(wi), len(ci)), dtype=np.float32)
    # Flush the dict-backed DOK buffer into the CSR accumulator every
    # `update_threshold` lines to keep the DOK matrix small.
    update_threshold = 100000
    i = 0
    with open(self.count_pair_file) as f:
        for line in f:
            count, word, context = line.strip().split()
            if word in wi and context in ci:
                # NOTE(review): '=' overwrites rather than accumulates;
                # this assumes each (word, context) pair occurs at most
                # once per flush window — confirm with the pair-file
                # producer.
                tmp_counts[wi[word], ci[context]] = int(count)
            i += 1
            if i == update_threshold:
                counts = counts + tmp_counts.tocsr()
                tmp_counts = dok_matrix((len(wi), len(ci)), dtype=np.float32)
                i = 0
    # Fold in the final partial buffer.
    counts = counts + tmp_counts.tocsr()

    pmi = self.calc_pmi(counts, self.cds)
    save_matrix(self.pmi_file, pmi)
    save_vocabulary(self.pmi_file + '.words.vocab', iw)
    save_vocabulary(self.pmi_file + '.contexts.vocab', ic)
    self.explicit = PositiveExplicit(self.pmi_file, normalize=False, neg=self.neg)
    # Persist next to the first path component of dict_name
    # (assumes dict_name has at least two '/'-separated parts).
    cf.saveDictionary(
        self.explicit,
        self.dict_name.split('/')[0] + '/'
        + self.dict_name.split('/')[1] + '_explicit_ppmi.bin')
def main():
    """CLI entry point: convert a counts matrix to PMI and save both.

    Loads the counts matrix and vocabularies from ``<counts>``, computes
    PMI with context-distribution smoothing ``--cds``, and writes the raw
    count matrix, the PMI matrix, and both vocabularies under
    ``<output_path>``.
    """
    args = docopt("""
    Usage:
        counts2pmi.py [options] <counts> <output_path>

    Options:
        --cds NUM    Context distribution smoothing [default: 1.0]
    """)

    counts_path = args['<counts>']
    vectors_path = args['<output_path>']
    cds = float(args['--cds'])

    counts, iw, ic = read_counts_matrix(counts_path)
    # alpha=1.0 keeps plain PMI weighting; cds applies context smoothing.
    pmi = calc_pmi(counts, cds, alpha=1.0)

    save_matrix(vectors_path + '.count_matrix', counts)
    save_matrix(vectors_path, pmi)
    save_vocabulary(vectors_path + '.words.vocab', iw)
    save_vocabulary(vectors_path + '.contexts.vocab', ic)
def main():
    """CLI entry point: compute PMI/PPMI variants and dump nonzero terms.

    Copies the counts file with normalized line endings, builds the counts
    matrix, then writes four nonzero-term dumps under ``<output_path>``:
    raw co-occurrence, log PMI, PPMI-filtered co-occurrence, and PPMI.
    """
    args = docopt("""
    Usage:
        counts2pmi.py [options] <counts> <output_path>

    Options:
        --cds NUM    Context distribution smoothing [default: 1.0]
    """)

    counts_path = args['<counts>']
    vectors_path = args['<output_path>']
    cds = float(args['--cds'])

    # Rewrite the counts file with stripped lines into a sibling '-new'
    # file; context managers close both handles even on error (the
    # original leaked the read handle and was not exception-safe).
    counts_path_new = counts_path + '-new'
    with open(counts_path) as src, open(counts_path_new, 'w') as dst:
        for line in src:
            dst.write(line.strip() + '\n')

    # [sic] helper is spelled 'matrxi' in the project — do not "fix" here.
    counts, iw, ic = read_counts_matrxi_fast(counts_path, counts_path_new)
    pmi = calc_pmi(counts, cds)
    save_matrix(vectors_path, pmi)
    save_vocabulary(vectors_path + '.words.vocab', iw)
    save_vocabulary(vectors_path + '.contexts.vocab', ic)
    savePmiNonzeroTerm_fast(counts, vectors_path + '.cooccurrence')

    # Mask of entries whose ratio > 1, i.e. positive PMI after the log.
    # Must be computed before mutating pmi.data so indices stay aligned
    # with counts.data.
    remain_index = pmi.data > 1
    pmi.data = np.log(pmi.data)
    savePmiNonzeroTerm_fast(pmi, vectors_path + '.PMI')

    # Keep only co-occurrence counts whose PMI is positive.
    counts.data = counts.data * remain_index
    counts.eliminate_zeros()
    savePmiNonzeroTerm_fast(counts, vectors_path + '.PPMIcooccurrence')

    # Clamp negative PMI to zero -> PPMI.
    pmi.data[pmi.data < 0] = 0
    pmi.eliminate_zeros()
    savePmiNonzeroTerm_fast(pmi, vectors_path + '.PPMI')
def main():
    """CLI entry point: compute ICA embeddings from a counts matrix.

    Loads the counts matrix and vocabularies from ``<counts>``, runs ICA
    with ``--cps`` components, and saves the embeddings and vocabularies
    under ``<output_path>``.
    """
    args = docopt("""
    Usage:
        counts2ica.py [options] <counts> <output_path>

    Options:
        --cps NUM    Number of ICA components to obtain [default: 50]
    """)

    counts_path = args['<counts>']
    vectors_path = args['<output_path>']
    # docopt returns option values as strings; the component count must
    # be an integer (the original passed the raw string through).
    n_components = int(args['--cps'])

    counts, iw, ic = read_counts_matrix(counts_path)
    embeddings = calc_ica(counts, n_components)

    save_matrix(vectors_path, embeddings)
    save_vocabulary(vectors_path + '.words.vocab', iw)
    save_vocabulary(vectors_path + '.contexts.vocab', ic)
def main():
    """CLI entry point: counts matrix -> PMI matrix on disk.

    Reads the counts matrix and its vocabularies from ``<counts>``,
    applies context-distribution smoothing ``--cds``, and writes the PMI
    matrix plus both vocabularies under ``<output_path>``.
    """
    args = docopt("""
    Usage:
        counts2pmi.py [options] <counts> <output_path>

    Options:
        --cds NUM    Context distribution smoothing [default: 1.0]
    """)

    counts_path, vectors_path = args['<counts>'], args['<output_path>']
    smoothing = float(args['--cds'])

    counts, word_list, context_list = read_counts_matrix(counts_path)
    pmi_matrix = calc_pmi(counts, smoothing)

    save_matrix(vectors_path, pmi_matrix)
    save_vocabulary(vectors_path + '.words.vocab', word_list)
    save_vocabulary(vectors_path + '.contexts.vocab', context_list)
def main():
    """Command-line driver that turns a counts matrix into a PMI matrix.

    Outputs three artifacts rooted at ``<output_path>``: the PMI matrix
    itself and the word/context vocabulary files.
    """
    args = docopt("""
    Usage:
        counts2pmi.py [options] <counts> <output_path>

    Options:
        --cds NUM    Context distribution smoothing [default: 1.0]
    """)

    counts_path = args['<counts>']
    out_path = args['<output_path>']
    cds = float(args['--cds'])

    matrix, iw, ic = read_counts_matrix(counts_path)
    save_matrix(out_path, calc_pmi(matrix, cds))

    # Vocabularies are saved alongside the matrix for later lookup.
    for suffix, vocab in (('.words.vocab', iw), ('.contexts.vocab', ic)):
        save_vocabulary(out_path + suffix, vocab)
def main():
    """CLI entry point: build a PPMI matrix from pre-built vocabularies.

    Loads the counts matrix using the explicitly supplied word and context
    vocabulary files, applies smoothing ``--cds``, and saves the resulting
    PMI matrix to ``<output>``.
    """
    args = docopt("""
    Usage:
        counts2pmi.py [options] <words_vocab> <contexts_vocab> <counts> <output>

    Options:
        --cds NUM    Context distribution smoothing [default: 1.0]
    """)

    # Parenthesized single-argument form prints identically under
    # Python 2 and is valid Python 3 (original used py2-only statements).
    print("**********************")
    print("counts2ppmi")

    counts_path = args['<counts>']
    vectors_path = args['<output>']
    words_path = args['<words_vocab>']
    contexts_path = args['<contexts_vocab>']
    cds = float(args['--cds'])

    counts = read_counts_matrix(words_path, contexts_path, counts_path)
    pmi = calc_pmi(counts, cds)
    save_matrix(vectors_path, pmi)
def main():
    """CLI entry point: counts + explicit vocabularies -> saved PMI matrix.

    Identical pipeline to the sibling counts2ppmi driver: read the counts
    matrix with the given word/context vocab files, smooth with ``--cds``,
    and write the PMI matrix to ``<output>``.
    """
    args = docopt("""
    Usage:
        counts2pmi.py [options] <words_vocab> <contexts_vocab> <counts> <output>

    Options:
        --cds NUM    Context distribution smoothing [default: 1.0]
    """)

    # Single-argument print() is source-compatible with both Python 2
    # and Python 3 (original used py2-only print statements).
    print("**********************")
    print("counts2ppmi")

    counts_path = args['<counts>']
    vectors_path = args['<output>']
    words_path = args['<words_vocab>']
    contexts_path = args['<contexts_vocab>']
    cds = float(args['--cds'])

    counts = read_counts_matrix(words_path, contexts_path, counts_path)
    pmi = calc_pmi(counts, cds)
    save_matrix(vectors_path, pmi)