def train_tuples(self, corpus_infile, counts_infile, epochs=1, report_freq=20):
    if len(self.index2sample) == 0:
        logger.error("attempted to start training but vocabulary has not been loaded")
        raise RuntimeError("You must build/load the vocabulary before training the model")
    epochs = int(epochs) or 1

    # count the number of phrase vectors to be learned
    vocabsize = 0
    with h5py.File(counts_infile, "r") as fcount:
        phrase_index2count = fcount["index2count"][:]
        phrase_index2name = fcount["index2name"][:]
        vocabsize = len(phrase_index2count)

    # initialise temporary work memory and phrase vectors
    work = np.zeros(self.dim, dtype=floatX)
    embeddings = np.ascontiguousarray((np.random.rand(vocabsize, self.dim) - 0.5) / self.dim, dtype=floatX)
    logger.info("initialised a %s x %s phrase embedding matrix", si(vocabsize), si(self.dim))

    with smart_open(corpus_infile, 'r') as fin:
        total_words = 0
        # read the number of sentences in the corpus
        corpus_sentences = int(next(fin).strip())
        total_sentences = epochs * corpus_sentences
        logger.info("loaded corpus with %s examples, training for %d epochs", si(corpus_sentences), epochs)

        t = Timer(interval=report_freq)
        t.tic()
        word_count = 0
        for epoch in range(epochs):
            fin.seek(0)
            next(fin)  # skip first line with number of sentences
            for sentence_num, line in enumerate(fin, start=epoch * corpus_sentences):
                sentence = list(map(int, line.strip().split()))[:self.window + 1]
                if len(sentence) <= 1:
                    continue
                alpha = self.alpha * (1 - sentence_num / total_sentences)
                word_count += len(sentence) - 1
                train_tuple(self, sentence, alpha, embeddings, work)
                if t.ready():
                    t.toc()
                    logger.info("%.2f%% examples @ %s words/s, alpha %.6f"
                                % (100 * sentence_num / total_sentences, si(word_count / t.interval), alpha))
                    total_words += word_count
                    word_count = 0
        total_words += word_count

    logger.info("trained on %s words (%s examples) in %s @ %s words/s"
                % (si(total_words), si(total_sentences), t.toc(hms=True), si(total_words / t.toc())))
    return Embeddings(embeddings, phrase_index2name, phrase_index2count)
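# Hedged sketch, not part of the model: illustrates the on-disk inputs the loop above expects.
# The counts file is HDF5 with "index2count"/"index2name" datasets; the corpus starts with the
# number of examples, then one space-separated line of integer ids per example (read here as the
# phrase id followed by its context word ids, of which at most `window` are used). The file
# names and ids below are purely illustrative; numpy/h5py come from this module's existing imports.
def _sketch_tuple_inputs(counts_path="phrase_counts.h5", corpus_path="tuple_corpus.txt"):
    with h5py.File(counts_path, "w") as fcount:
        fcount.create_dataset("index2count", data=np.array([12, 7]))
        fcount.create_dataset("index2name", data=np.array([b"red_car", b"old_dog"]))
    with open(corpus_path, "w") as fout:
        fout.write("2\n")          # number of training examples
        fout.write("0 5 9 13\n")   # phrase id 0 with three context word ids
        fout.write("1 2 8\n")      # phrase id 1 with two context word ids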
def train_sentences(self, corpus_infile, epochs=1, report_freq=20):
    if len(self.index2sample) == 0:
        logger.error("attempted to start training but vocabulary has not been loaded")
        raise RuntimeError("You must build/load the vocabulary before training the model")
    epochs = int(epochs) or 1

    # initialise temporary work memory and word vectors
    work = np.zeros(self.dim, dtype=floatX)
    embeddings = np.ascontiguousarray((np.random.rand(len(self.index2name), self.dim) - 0.5) / self.dim, dtype=floatX)
    logger.info("initialised a %s x %s embedding matrix", si(len(self.index2name)), si(self.dim))

    with smart_open(corpus_infile, 'r') as fin:
        total_words = 0
        # read the number of sentences in the corpus
        corpus_sentences = int(next(fin).strip())
        total_sentences = epochs * corpus_sentences
        logger.info("loaded corpus with %s sentences, training for %d epochs", si(corpus_sentences), epochs)

        t = Timer(interval=report_freq)
        t.tic()
        word_count = 0
        for epoch in range(epochs):
            fin.seek(0)
            next(fin)  # skip first line with number of sentences
            for sentence_num, line in enumerate(fin, start=epoch * corpus_sentences):
                alpha = self.alpha * (1 - sentence_num / total_sentences)
                sentence = list(map(int, line.strip().split()))
                word_count += len(sentence)
                train_sentence(self, sentence, alpha, embeddings, work)
                if t.ready():
                    t.toc()
                    if self.dev:
                        cor = self.test_dev(embeddings)
                        logger.info("%.2f%% sentences @ %s words/s, alpha %.6f, corr %.5f (p %.2e)"
                                    % (100 * sentence_num / total_sentences, si(word_count / t.interval),
                                       alpha, cor[0], cor[1]))
                    else:
                        logger.info("%.2f%% sentences @ %s words/s, alpha %.6f"
                                    % (100 * sentence_num / total_sentences, si(word_count / t.interval), alpha))
                    total_words += word_count
                    word_count = 0
        total_words += word_count

    logger.info("trained on %s sentences (%s words) in %s @ %s words/s"
                % (si(total_sentences), si(total_words), t.toc(hms=True), si(total_words / t.toc())))
    if self.dev:  # only evaluate when a development set was provided
        cor = self.test_dev(embeddings)
        logger.info("correlation on development set %.5f (p %.2e)" % cor)
    return Embeddings(embeddings, self.index2name, self.index2count)
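# Hedged illustration (plain arithmetic, no new dependencies): both training loops decay the
# learning rate linearly with overall progress, alpha = self.alpha * (1 - seen / total), so the
# last examples of the final epoch are trained with a rate close to zero. Values are illustrative.
def _sketch_alpha_schedule(alpha0=0.025, corpus_sentences=1000, epochs=2):
    total = epochs * corpus_sentences
    # sample the schedule at the first, middle and last example
    return [alpha0 * (1 - seen / total) for seen in (0, total // 2, total - 1)]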
def learn_rc_tensors(words, modifiers, compounds, weight=(lambda c: np.log(c)), which="orc",
                     min_count=2, parameter=80.0, report_freq=20):
    logger.info("regularisation parameter: %s", str(parameter))
    if which == "orc":
        arg1 = 3
        arg2 = 2
        logger.info("learning object relative pronouns")
    else:
        arg1 = 2
        arg2 = 3
        logger.info("learning subject relative pronouns")

    # build LexicalFunctions object
    lfs_counter = Counter((m for _, m, *_ in compounds.index2name))
    lfs_index2name = [m for m, _ in lfs_counter.most_common()]
    vocabsize = len(lfs_index2name)
    dim = words.shape[1]
    arr = np.zeros((vocabsize, dim, dim * dim), dtype=floatX)
    eye = np.eye(dim * dim)
    lfs = LexicalFunctions(arr, lfs_index2name)
    logger.info("initialised a %s x %s x %s tensor embedding tensor", si(vocabsize), si(dim), si(dim * dim))

    t = Timer(interval=report_freq)

    # build a list of training examples for each modifier
    t.tic()
    phrase_examples = defaultdict(list)
    for i, z in enumerate(zip(compounds.index2name, compounds.index2count)):
        if z[1] >= min_count and z[0][arg1] in modifiers.name2index:
            phrase_examples[z[0][1]].append((words.name2index[z[0][0]],
                                             modifiers.name2index[z[0][arg1]],
                                             words.name2index[z[0][arg2]],
                                             i, z[1]))
    logger.info("Examples built in " + t.toc(hms=True))

    # solve AX = B for each modifier
    t.tic()
    for ex_num, (lf_name, examples) in enumerate(phrase_examples.items()):
        if len(examples) < 1:
            continue
        B = np.zeros((len(examples), dim))
        A = np.zeros((len(examples), dim * dim))
        for i, z in enumerate(examples):
            noun1_index, modifier_index, noun2_index, phrase_index, count = z
            w = weight(count)
            A[i] = w * np.outer(words[noun1_index], np.dot(modifiers[modifier_index], words[noun2_index])).flatten()
            B[i] = w * compounds[phrase_index]
        tmp1 = pinv(np.dot(A.T, A) + parameter * eye)
        tmp2 = np.dot(A.T, B)
        lfs.A[lfs.name2index[lf_name]] = np.dot(tmp1, tmp2).T
        if t.ready():
            t.toc()
            logger.info("%.2f%% matrices (%s, %s)"
                        % (100 * (ex_num + 1) / vocabsize, si(ex_num + 1), t.toc(hms=True)))
    logger.info("learned %s tensors in %s" % (si(vocabsize), t.toc(hms=True)))
    return lfs
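# Hedged numpy sketch of the regularised least-squares step above: each tensor is the ridge
# solution X = (A^T A + lambda * I)^-1 A^T B of A X = B, stored transposed so that it maps the
# flattened outer product of its two argument vectors (length dim*dim) to a dim-dimensional
# phrase vector. Random data, shapes only; np.linalg.pinv stands in for the module's pinv.
def _sketch_ridge_solution(n_examples=10, dim=4, parameter=80.0):
    A = np.random.rand(n_examples, dim * dim)          # weighted flattened outer products
    B = np.random.rand(n_examples, dim)                # weighted observed phrase vectors
    X = np.dot(np.linalg.pinv(np.dot(A.T, A) + parameter * np.eye(dim * dim)), np.dot(A.T, B))
    tensor = X.T                                       # shape (dim, dim*dim), as stored in lfs.A
    return np.dot(tensor, np.random.rand(dim * dim))   # composes to a dim-dimensional vector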
def learn_matrices(words, compounds, weight=(lambda c: np.log(c)), min_count=2, parameter=80, report_freq=20):
    # build LexicalFunctions object
    lfs_counter = Counter((a for a, _ in compounds.index2name))
    lfs_index2name = [a for a, _ in lfs_counter.most_common()]
    vocabsize = len(lfs_index2name)
    dim = words.shape[1]
    arr = np.zeros((vocabsize, dim, dim), dtype=floatX)
    eye = np.eye(dim)
    lfs = LexicalFunctions(arr, lfs_index2name)
    logger.info("initialised a %s x %s x %s matrix embedding tensor", si(vocabsize), si(dim), si(dim))

    t = Timer(interval=report_freq)

    # build a list of training examples for each modifier
    t.tic()
    phrase_examples = defaultdict(list)
    for i, z in enumerate(zip(compounds.index2name, compounds.index2count)):
        if z[1] >= min_count:
            phrase_examples[z[0][0]].append((words.name2index[z[0][1]], i, z[1]))
    logger.info("Examples built in " + t.toc(hms=True))

    # solve AX = B for each modifier
    t.tic()
    for ex_num, (lf_name, examples) in enumerate(phrase_examples.items()):
        if len(examples) < 1:
            continue
        B = np.zeros((len(examples), dim))
        A = np.zeros((len(examples), dim))
        for i, z in enumerate(examples):
            noun_index, phrase_index, count = z
            w = weight(count)
            A[i] = w * words[noun_index]
            B[i] = w * compounds[phrase_index]
        tmp1 = pinv(np.dot(A.T, A) + parameter * eye)
        tmp2 = np.dot(A.T, B)
        lfs.A[lfs.name2index[lf_name]] = np.dot(tmp1, tmp2).T
        if t.ready():
            t.toc()
            logger.info("%.2f%% matrices (%s)" % (100 * (ex_num + 1) / vocabsize, si(ex_num + 1)))
    logger.info("learned %s matrices in %s" % (si(vocabsize), t.toc(hms=True)))
    return lfs
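# Hedged composition sketch (illustrative numpy only, not the repo's API): each learned matrix is
# fit so that matrix @ noun_vector approximates the observed compound vector, so composing a new
# adjective-noun phrase amounts to a single matrix-vector product. The random arrays below stand
# in for a row of lfs.A and a row of the word embedding matrix.
def _sketch_composition(dim=4):
    adjective_matrix = np.random.rand(dim, dim)    # stands in for a learned lexical function
    noun_vector = np.random.rand(dim)              # stands in for a noun's word embedding
    return np.dot(adjective_matrix, noun_vector)   # composed phrase vector of length dim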