import json
from collections import Counter

import h5py
import numpy as np
from scipy.spatial.distance import cosine
from scipy.stats import spearmanr

# Project-local helpers assumed to be defined/imported elsewhere in this module:
# smart_open, logger, Timer, si, floatX, Embeddings, train_tuple, train_sentence, _ap


def similarity(arg1, test_infile):
    with smart_open(test_infile, "r") as f:
        test = json.load(f)
    gold = np.array([float(x[2]) for x in test])
    # we're given a tuple: matrix-vector composition
    if isinstance(arg1, tuple):
        if len(arg1) == 2:
            lf = arg1[0]
            emb = arg1[1]
            # augmented matrices
            if lf.A.shape[2] == lf.A.shape[1] + 1:
                ours = np.array([1 - cosine(
                    np.dot(lf.word(x[0][0]), np.hstack((emb.word(x[0][1]), [1]))),
                    np.dot(lf.word(x[1][0]), np.hstack((emb.word(x[1][1]), [1]))))
                    for x in test])
            # standard matrices
            else:
                ours = np.array([1 - cosine(
                    np.dot(lf.word(x[0][0]), emb.word(x[0][1])),
                    np.dot(lf.word(x[1][0]), emb.word(x[1][1])))
                    for x in test])
            return spearmanr(gold, ours)
        raise TypeError("Invalid input format")
    # we're only given embeddings: do cosine similarity of vectors
    elif isinstance(arg1, Embeddings):
        ours = np.array([1 - cosine(arg1.word(x[0]), arg1.word(x[1])) for x in test])
        return spearmanr(gold, ours)
    raise TypeError("Invalid input format")
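# Usage sketch for similarity() (the file names and objects below are
# hypothetical). For plain embeddings each test entry is [word1, word2, score];
# for the (matrices, embeddings) case each entry is
# [[head1, arg1], [head2, arg2], score].
def _demo_similarity(lf, emb):
    rho, p = similarity(emb, "wordsim_test.json")              # word-word cosine
    rho_mv, p_mv = similarity((lf, emb), "phrase_test.json")   # matrix-vector composition
    return (rho, p), (rho_mv, p_mv)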
def train_tuples(self, corpus_infile, counts_infile, epochs=1, report_freq=20):
    if len(self.index2sample) == 0:
        logger.error("attempted to start training but vocabulary has not been loaded")
        raise RuntimeError("You must build/load the vocabulary before training the model")
    epochs = int(epochs) or 1
    # count the number of phrase vectors to be learned
    vocabsize = 0
    with h5py.File(counts_infile, "r") as fcount:
        phrase_index2count = fcount["index2count"][:]
        phrase_index2name = fcount["index2name"][:]
        vocabsize = len(phrase_index2count)
    # initialise temporary work memory and phrase vectors
    work = np.zeros(self.dim, dtype=floatX)
    embeddings = np.ascontiguousarray((np.random.rand(vocabsize, self.dim) - 0.5) / self.dim,
                                      dtype=floatX)
    logger.info("initialised a %s x %s phrase embedding matrix", si(vocabsize), si(self.dim))
    with smart_open(corpus_infile, 'r') as fin:
        total_words = 0
        # read the number of sentences in the corpus
        corpus_sentences = int(next(fin).strip())
        total_sentences = epochs * corpus_sentences
        logger.info("loaded corpus with %s examples, training for %d epochs",
                    si(corpus_sentences), epochs)
        t = Timer(interval=report_freq)
        t.tic()
        word_count = 0
        for epoch in range(epochs):
            fin.seek(0)
            next(fin)  # skip first line with number of sentences
            for sentence_num, line in enumerate(fin, start=epoch * corpus_sentences):
                sentence = list(map(int, line.strip().split()))[:self.window + 1]
                if len(sentence) <= 1:
                    continue
                alpha = self.alpha * (1 - sentence_num / total_sentences)
                word_count += len(sentence) - 1
                train_tuple(self, sentence, alpha, embeddings, work)
                if t.ready():
                    t.toc()
                    logger.info("%.2f%% examples @ %s words/s, alpha %.6f"
                                % (100 * sentence_num / total_sentences,
                                   si(word_count / t.interval), alpha))
                    total_words += word_count
                    word_count = 0
        total_words += word_count
        logger.info("trained on %s words (%s examples) in %s @ %s words/s"
                    % (si(total_words), si(total_sentences), t.toc(hms=True),
                       si(total_words / t.toc())))
    return Embeddings(embeddings, phrase_index2name, phrase_index2count)
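# Usage sketch for train_tuples() (hypothetical file names; assumes `model` is
# an instance of the class these methods belong to, with its vocabulary already
# built/loaded). The corpus starts with a line holding the number of examples,
# then one space-separated list of integer token ids per line; the counts file
# is HDF5 with "index2count" and "index2name" datasets.
def _demo_train_tuples(model):
    phrase_embeddings = model.train_tuples("tuples_corpus.txt", "phrase_counts.h5",
                                           epochs=3, report_freq=60)
    return phrase_embeddings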
def train_sentences(self, corpus_infile, epochs=1, report_freq=20):
    if len(self.index2sample) == 0:
        logger.error("attempted to start training but vocabulary has not been loaded")
        raise RuntimeError("You must build/load the vocabulary before training the model")
    epochs = int(epochs) or 1
    # initialise temporary work memory and word vectors
    work = np.zeros(self.dim, dtype=floatX)
    embeddings = np.ascontiguousarray(
        (np.random.rand(len(self.index2name), self.dim) - 0.5) / self.dim, dtype=floatX)
    logger.info("initialised a %s x %s embedding matrix", si(len(self.index2name)), si(self.dim))
    with smart_open(corpus_infile, 'r') as fin:
        total_words = 0
        # read the number of sentences in the corpus
        corpus_sentences = int(next(fin).strip())
        total_sentences = epochs * corpus_sentences
        logger.info("loaded corpus with %s sentences, training for %d epochs",
                    si(corpus_sentences), epochs)
        t = Timer(interval=report_freq)
        t.tic()
        word_count = 0
        for epoch in range(epochs):
            fin.seek(0)
            next(fin)  # skip first line with number of sentences
            for sentence_num, line in enumerate(fin, start=epoch * corpus_sentences):
                alpha = self.alpha * (1 - sentence_num / total_sentences)
                sentence = list(map(int, line.strip().split()))
                word_count += len(sentence)
                train_sentence(self, sentence, alpha, embeddings, work)
                if t.ready():
                    t.toc()
                    if self.dev:
                        cor = self.test_dev(embeddings)
                        logger.info("%.2f%% sentences @ %s words/s, alpha %.6f, corr %.5f (p %.2e)"
                                    % (100 * sentence_num / total_sentences,
                                       si(word_count / t.interval), alpha, cor[0], cor[1]))
                    else:
                        logger.info("%.2f%% sentences @ %s words/s, alpha %.6f"
                                    % (100 * sentence_num / total_sentences,
                                       si(word_count / t.interval), alpha))
                    total_words += word_count
                    word_count = 0
        total_words += word_count
        logger.info("trained on %s sentences (%s words) in %s @ %s words/s"
                    % (si(total_sentences), si(total_words), t.toc(hms=True),
                       si(total_words / t.toc())))
    # final evaluation only applies when a development set was provided
    if self.dev:
        cor = self.test_dev(embeddings)
        logger.info("correlation on development set %.5f (p %.2e)" % cor)
    return Embeddings(embeddings, self.index2name, self.index2count)
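# Usage sketch for train_sentences() (hypothetical file name; same corpus
# layout as above). If `model.dev` is set, a correlation against the
# development set is logged alongside the progress reports.
def _demo_train_sentences(model):
    word_embeddings = model.train_sentences("sentences_corpus.txt", epochs=5)
    return word_embeddings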
def meanap(relpr_s, relpr_o, verb_s, verb_o, noun, test_infile):
    with smart_open(test_infile, "r") as f:
        test = json.load(f)
    # extract target nouns and how often each occurs as a gold answer
    target_nouns = set(t for _, t, *_ in test)
    counts = Counter(t for _, t, *_ in test)
    # compose relative clauses
    relative_clauses = []
    for which, t, n1, relpr, v, n2 in test:
        # alternative compositions tried during development:
        #relative_clauses.append((t, np.dot(verb_o.word(v), noun.word(n2))))
        #relative_clauses.append((t, noun.word(n1)+noun.word(v)+noun.word(n2)))
        #relative_clauses.append((t, noun.word(v)+noun.word(n2)))
        #relative_clauses.append((t, noun.word(n2)))
        #relative_clauses.append((t, noun.word(n1)+np.dot(verb_o.word(v),noun.word(n2))))
        #relative_clauses.append((t, np.dot(relpr_s.word(relpr),
        #                                   np.outer(noun.word(n1),
        #                                            noun.word(v)+noun.word(n2)).flatten())))
        if which == "SBJ":
            relative_clauses.append((t, np.dot(relpr_s.word(relpr),
                                               np.outer(noun.word(n1),
                                                        np.dot(verb_o.word(v),
                                                               noun.word(n2))).flatten())))
        else:
            # object relatives: same composition with the subject/object roles swapped
            relative_clauses.append((t, np.dot(relpr_o.word(relpr),
                                               np.outer(noun.word(n1),
                                                        np.dot(verb_s.word(v),
                                                               noun.word(n2))).flatten())))
    scores = []
    for target in target_nouns:
        # rank all composed clauses by cosine similarity to the target noun
        predicted = [(t, 1 - cosine(noun.word(target), v)) for t, v in relative_clauses]
        predicted.sort(key=lambda x: x[1], reverse=True)
        ap = _ap(target, [t for t, *_ in predicted], counts[target])
        scores.append(ap)
    return np.mean(scores)
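# Usage sketch for meanap() (hypothetical file name). Each test entry is
# [which, target, noun1, relpron, verb, noun2], where which is "SBJ" for
# subject relatives (anything else is treated as an object relative);
# relpr_s/relpr_o and verb_s/verb_o are matrix lexicons with a word() lookup,
# and noun is a plain Embeddings object.
def _demo_meanap(relpr_s, relpr_o, verb_s, verb_o, noun):
    map_score = meanap(relpr_s, relpr_o, verb_s, verb_o, noun, "relpron_test.json")
    return map_score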