def load_set(files, vocab=None, skip_unlabeled=True, spad=spad):
    def load_file(fname, skip_unlabeled=True):
        # XXX: ugly logic
        if 'sick2014' in fname:
            return loader.load_sick2014(fname)
        else:
            return loader.load_sts(fname, skip_unlabeled=skip_unlabeled)

    try:
        strtype = basestring
    except NameError:
        strtype = str
    if isinstance(files, strtype):
        s0, s1, y = load_file(files, skip_unlabeled=skip_unlabeled)
    else:
        s0, s1, y = loader.concat_datasets([load_file(d, skip_unlabeled=skip_unlabeled) for d in files])

    if vocab is None:
        vocab = Vocabulary(s0 + s1)

    si0 = vocab.vectorize(s0, spad=spad)
    si1 = vocab.vectorize(s1, spad=spad)
    f0, f1 = nlp.sentence_flags(s0, s1, spad, spad)
    gr = graph_input_sts(si0, si1, y, f0, f1, s0, s1)

    return (s0, s1, y, vocab, gr)
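# The snippets in this listing are lifted from different modules and rely on a
# shared set of helpers (loader, nlp, emb, Vocabulary, the graph_input_*
# builders).  The import paths below are an assumption about a pysts-style
# package layout, shown only so the individual functions can be read in
# context:
#
#   import pickle
#   import numpy as np
#   import pysts.loader as loader
#   import pysts.nlp as nlp
#   import pysts.embedding as emb
#   from pysts.vocab import Vocabulary
#   from pysts.kerasts import graph_input_anssel, graph_input_sts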
def load_set(self, fname, cache_dir=None):
    # TODO: Make the cache-handling generic,
    # and offer a way to actually pass cache_dir
    save_cache = False
    if cache_dir:
        import os.path
        fname_abs = os.path.abspath(fname)
        from hashlib import md5
        cache_filename = "%s/%s.p" % (cache_dir, md5(fname_abs.encode("utf-8")).hexdigest())
        try:
            with open(cache_filename, "rb") as f:
                return pickle.load(f)
        except (IOError, TypeError, KeyError):
            save_cache = True

    s0, s1, y = loader.load_hypev(fname)

    if self.vocab is None:
        vocab = Vocabulary(s0 + s1)  # FIXME: lower?
    else:
        vocab = self.vocab

    si0 = vocab.vectorize(s0, spad=self.s0pad)
    si1 = vocab.vectorize(s1, spad=self.s1pad)
    f0, f1 = nlp.sentence_flags(s0, s1, self.s0pad, self.s1pad)
    gr = graph_input_anssel(si0, si1, y, f0, f1, s0, s1)
    gr, y = self.merge_questions(gr)

    if save_cache:
        with open(cache_filename, "wb") as f:
            pickle.dump((s0, s1, y, vocab, gr), f)
            print("save")

    return (gr, y, vocab)
def load_set(fname, vocab=None, s0pad=s0pad, s1pad=s1pad, cache_dir=None, skip_oneclass=True):
    """ Caching: if cache_dir is set, try to load the finished dataset from it
    (the cache filename is a hash of fname); if that fails, compute the
    dataset and try to save it. """
    save_cache = False
    if cache_dir:
        fname_abs = os.path.abspath(fname)
        from hashlib import md5
        cache_filename = "%s/%s.p" % (cache_dir, md5(fname_abs.encode("utf-8")).hexdigest())
        try:
            with open(cache_filename, "rb") as f:
                return pickle.load(f)
        except (IOError, TypeError, KeyError):
            save_cache = True

    s0, s1, y, t = loader.load_anssel(fname, skip_oneclass=skip_oneclass)
    # TODO: Make use of the t-annotations

    if vocab is None:
        vocab = Vocabulary(s0 + s1)

    si0 = vocab.vectorize(s0, spad=s0pad)
    si1 = vocab.vectorize(s1, spad=s1pad)
    f0, f1 = nlp.sentence_flags(s0, s1, s0pad, s1pad)
    gr = graph_input_anssel(si0, si1, y, f0, f1, s0, s1)

    if save_cache:
        with open(cache_filename, "wb") as f:
            pickle.dump((s0, s1, y, vocab, gr), f)
            print("save")

    return (s0, s1, y, vocab, gr)
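# A self-contained sketch of the cache-key scheme used above: the cache file
# name is the md5 hex digest of the dataset's absolute path, and the finished
# dataset is pickled under cache_dir.  The example path is made up.
import os.path
from hashlib import md5

def cache_path(cache_dir, fname):
    fname_abs = os.path.abspath(fname)
    return "%s/%s.p" % (cache_dir, md5(fname_abs.encode("utf-8")).hexdigest())

# cache_path('cache', 'data/anssel/train.csv') -> 'cache/<32-hex-char digest>.p'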
def load_set(self, fname, cache_dir=None):
    # TODO: Make the cache-handling generic,
    # and offer a way to actually pass cache_dir
    save_cache = False
    if cache_dir:
        import os.path
        fname_abs = os.path.abspath(fname)
        from hashlib import md5
        cache_filename = "%s/%s.p" % (cache_dir, md5(fname_abs.encode("utf-8")).hexdigest())
        try:
            with open(cache_filename, "rb") as f:
                return pickle.load(f)
        except (IOError, TypeError, KeyError):
            save_cache = True

    skip_oneclass = self.c.get('skip_oneclass', True)
    s0, s1, y, kw, akw, t = loader.load_anssel(fname, skip_oneclass=skip_oneclass)
    # TODO: Make use of the t-annotations

    if self.vocab is None:
        vocab = Vocabulary(s0 + s1)
    else:
        vocab = self.vocab

    si0 = vocab.vectorize(s0, spad=self.s0pad)
    si1 = vocab.vectorize(s1, spad=self.s1pad)
    f0, f1 = nlp.sentence_flags(s0, s1, self.s0pad, self.s1pad)
    gr = graph_input_anssel(si0, si1, y, f0, f1, s0, s1, kw=kw, akw=akw)

    if save_cache:
        with open(cache_filename, "wb") as f:
            pickle.dump((s0, s1, y, vocab, gr), f)
            print("save")

    return (gr, y, vocab)
def load_set(fname, vocab=None):
    s0, s1, y, _, _, _ = loader.load_anssel(fname)

    if vocab is None:
        vocab = Vocabulary(s0 + s1)

    si0 = vocab.vectorize(s0)
    si1 = vocab.vectorize(s1)
    f0, f1 = nlp.sentence_flags(s0, s1, s0pad, s1pad)
    gr = graph_input_anssel(si0, si1, y, f0, f1)

    return (s0, s1, y, vocab, gr)
def load_set(fname, vocab=None):
    s0, s1, y, t = loader.load_anssel(fname)

    if vocab is None:
        vocab = Vocabulary(s0 + s1)

    si0 = vocab.vectorize(s0)
    si1 = vocab.vectorize(s1)
    f0, f1 = nlp.sentence_flags(s0, s1, s0pad, s1pad)
    gr = graph_input_anssel(si0, si1, y, f0, f1)

    return (s0, s1, y, vocab, gr)
def load_sent(q, a, vocab=None):
    s0, s1, y = [q], [a], 1  # s0=questions, s1=answers

    if vocab is None:
        vocab = Vocabulary(s0 + s1)

    si0 = vocab.vectorize(s0, spad=s0pad)
    si1 = vocab.vectorize(s1, spad=s1pad)
    f0, f1 = nlp.sentence_flags(s0, s1, s0pad, s1pad)
    gr = graph_input_anssel(si0, si1, y, f0, f1)
    return gr
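# Hypothetical single-pair call for load_sent above; `vocab` is assumed to be
# the Vocabulary the model was trained with, and the question/answer are
# already tokenized, as the loaders in this listing produce them.
q = ['what', 'is', 'the', 'capital', 'of', 'france']
a = ['paris', 'is', 'the', 'capital', 'of', 'france']
gr = load_sent(q, a, vocab=vocab)
# model.predict(gr)['score'] would then yield the match score for this pair.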
def load_set(fname, vocab=None, s0pad=s0pad, s1pad=s1pad):
    s0, s1, y, t = loader.load_anssel(fname)
    # TODO: Make use of the t-annotations

    if vocab is None:
        vocab = Vocabulary(s0 + s1)

    si0 = vocab.vectorize(s0, spad=s0pad)
    si1 = vocab.vectorize(s1, spad=s1pad)
    f0, f1 = nlp.sentence_flags(s0, s1, s0pad, s1pad)
    gr = graph_input_anssel(si0, si1, y, f0, f1, s0, s1)

    return (s0, s1, y, vocab, gr)
def load_set(fname, vocab=None):
    s0, s1, y = loader.load_hypev(fname)  # s0=questions, s1=answers

    if vocab is None:
        vocab = Vocabulary(s0 + s1)

    si0 = vocab.vectorize(s0, spad=s0pad)
    si1 = vocab.vectorize(s1, spad=s1pad)
    f0, f1 = nlp.sentence_flags(s0, s1, s0pad, s1pad)
    gr = graph_input_anssel(si0, si1, y, f0, f1, s0, s1)
    return s0, s1, y, vocab, gr
def load_set(self, fname, cache_dir=None):
    # TODO: Make the cache-handling generic,
    # and offer a way to actually pass cache_dir
    save_cache = False
    if cache_dir:
        import os.path
        fname_abs = os.path.abspath(fname)
        from hashlib import md5
        cache_filename = "%s/%s.p" % (cache_dir, md5(fname_abs.encode("utf-8")).hexdigest())
        try:
            with open(cache_filename, "rb") as f:
                return pickle.load(f)
        except (IOError, TypeError, KeyError):
            save_cache = True

    skip_oneclass = self.c.get('skip_oneclass', True)
    s0, s1, y, kw, akw, t = loader.load_anssel(fname, skip_oneclass=skip_oneclass)
    # TODO: Make use of the t-annotations

    if self.vocab is None:
        vocab = Vocabulary(s0 + s1, prune_N=self.c['embprune'], icase=self.c['embicase'])
    else:
        vocab = self.vocab

    si0, sj0 = vocab.vectorize(s0, self.emb, spad=self.s0pad)
    si1, sj1 = vocab.vectorize(s1, self.emb, spad=self.s1pad)
    f0, f1 = nlp.sentence_flags(s0, s1, self.s0pad, self.s1pad)
    gr = graph_input_anssel(si0, si1, sj0, sj1, None, None, y, f0, f1, s0, s1, kw=kw, akw=akw)

    if save_cache:
        with open(cache_filename, "wb") as f:
            pickle.dump((s0, s1, y, vocab, gr), f)
            print("save")

    return (gr, y, vocab)
def load_set(self, fname):
    s0, s1, y = loader.load_sick2014(fname, mode='entailment')

    if self.vocab is None:
        vocab = Vocabulary(s0 + s1, prune_N=self.c['embprune'], icase=self.c['embicase'])
    else:
        vocab = self.vocab

    si0, sj0 = vocab.vectorize(s0, self.emb, spad=self.s0pad)
    si1, sj1 = vocab.vectorize(s1, self.emb, spad=self.s1pad)
    f0, f1 = nlp.sentence_flags(s0, s1, self.s0pad, self.s1pad)
    gr = graph_input_anssel(si0, si1, sj0, sj1, None, None, y, f0, f1, s0, s1)

    return (gr, y, vocab)
def load_set(self, fname, cache_dir=None):
    s0, s1, y = loader.load_hypev(fname)

    if self.vocab is None:
        vocab = Vocabulary(s0 + s1)
    else:
        vocab = self.vocab

    si0 = vocab.vectorize(s0, spad=self.s0pad)
    si1 = vocab.vectorize(s1, spad=self.s1pad)
    f0, f1 = nlp.sentence_flags(s0, s1, self.s0pad, self.s1pad)
    gr = graph_input_anssel(si0, si1, y, f0, f1, s0, s1)

    return (gr, y, vocab)
def load_set(self, fname):
    s0, s1, y = loader.load_msrpara(fname)

    if self.vocab is None:
        vocab = Vocabulary(s0 + s1)
    else:
        vocab = self.vocab

    si0 = vocab.vectorize(s0, spad=self.s0pad)
    si1 = vocab.vectorize(s1, spad=self.s1pad)
    f0, f1 = nlp.sentence_flags(s0, s1, self.s0pad, self.s1pad)
    gr = graph_input_anssel(si0, si1, y, f0, f1, s0, s1)

    return (gr, y, vocab)
def load_set(fname, emb, vocab=None):
    s0, s1, y, _, _, _ = loader.load_anssel(fname)

    if vocab is None:
        vocab = Vocabulary(s0 + s1)

    si0, sj0 = vocab.vectorize(s0, emb)
    si1, sj1 = vocab.vectorize(s1, emb)
    se0 = emb.map_jset(sj0)
    se1 = emb.map_jset(sj1)
    f0, f1 = nlp.sentence_flags(s0, s1, s0pad, s1pad)
    gr = graph_input_anssel(si0, si1, sj0, sj1, se0, se1, y, f0, f1)

    # XXX: Pre-generating the whole (se0, se1) produces a *big* memory footprint
    # for the dataset.  In KeraSTS, we solve this by using fit_generator (also
    # because of epoch_fract) and embed just per-batch.

    return (s0, s1, y, vocab, gr)
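# Rough arithmetic behind the memory-footprint remark above, with illustrative
# numbers only: pre-embedding every token stores spad * N floats per sentence
# side, for every pair in the dataset.
n_pairs = 50000          # hypothetical dataset size
spad = 60                # tokens kept per sentence
N = 300                  # GloVe dimensionality
bytes_per_float = 4
total_bytes = 2 * n_pairs * spad * N * bytes_per_float
print('%.1f GB' % (total_bytes / 1e9))   # prints 7.2 GB for these numbers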
def load_set(files, vocab=None, skip_unlabeled=True):
    def load_file(fname, skip_unlabeled=True):
        # XXX: ugly logic
        if 'sick2014' in fname:
            return loader.load_sick2014(fname)
        else:
            return loader.load_sts(fname, skip_unlabeled=skip_unlabeled)

    s0, s1, y = loader.concat_datasets([load_file(d, skip_unlabeled=skip_unlabeled) for d in files])

    if vocab is None:
        vocab = Vocabulary(s0 + s1)

    si0 = vocab.vectorize(s0, spad=spad)
    si1 = vocab.vectorize(s1, spad=spad)
    f0, f1 = nlp.sentence_flags(s0, s1, spad, spad)
    gr = graph_input_sts(si0, si1, y, f0, f1)

    return (s0, s1, y, vocab, gr)
def load_set(self, fname):
    def load_file(fname, skip_unlabeled=True):
        # XXX: ugly logic
        if 'sick2014' in fname:
            return loader.load_sick2014(fname)
        else:
            return loader.load_sts(fname, skip_unlabeled=skip_unlabeled)

    s0, s1, y = load_file(fname)

    if self.vocab is None:
        vocab = Vocabulary(s0 + s1)
    else:
        vocab = self.vocab

    si0 = vocab.vectorize(s0, spad=self.s0pad)
    si1 = vocab.vectorize(s1, spad=self.s1pad)
    f0, f1 = nlp.sentence_flags(s0, s1, self.s0pad, self.s1pad)
    gr = graph_input_sts(si0, si1, y, f0, f1, s0, s1)

    return (gr, y, vocab)
def load_set(self, fname):
    def load_file(fname, skip_unlabeled=True):
        # XXX: ugly logic
        if 'sick2014' in fname:
            return loader.load_sick2014(fname)
        else:
            return loader.load_sts(fname, skip_unlabeled=skip_unlabeled)

    s0, s1, y = load_file(fname)

    if self.vocab is None:
        vocab = Vocabulary(s0 + s1, prune_N=self.c['embprune'], icase=self.c['embicase'])
    else:
        vocab = self.vocab

    si0, sj0 = vocab.vectorize(s0, self.emb, spad=self.s0pad)
    si1, sj1 = vocab.vectorize(s1, self.emb, spad=self.s1pad)
    f0, f1 = nlp.sentence_flags(s0, s1, self.s0pad, self.s1pad)
    gr = graph_input_sts(si0, si1, sj0, sj1, y, f0, f1, s0, s1)

    return (gr, y, vocab)
def load_set(self, fname, lists=None):
    if lists:
        s0, s1, y = lists
    else:
        # s0, s1, y = loader.load_msrpara(fname)  # set it free if we decide not to use the quora dataset
        s0, s1, y = loader.load_quora(fname)

    if self.vocab is None:
        vocab = Vocabulary(s0 + s1, prune_N=self.c['embprune'], icase=self.c['embicase'])
    else:
        vocab = self.vocab

    si0, sj0 = vocab.vectorize(s0, self.emb, spad=self.s0pad)
    si1, sj1 = vocab.vectorize(s1, self.emb, spad=self.s1pad)
    f0, f1 = nlp.sentence_flags(s0, s1, self.s0pad, self.s1pad)
    gr = graph_nparray_anssel(graph_input_anssel(si0, si1, sj0, sj1, None, None, y, f0, f1))

    return (gr, y, vocab)
def load_vocab(self, vocabf):
    self.texts = loader.load_askubuntu_texts(vocabf)
    self.vocab = Vocabulary(self.texts.values())
    return self.vocab
def load_set(self, fname, cache_dir=None, lists=None):
    # TODO: Make the cache-handling generic,
    # and offer a way to actually pass cache_dir
    save_cache = False
    if cache_dir:
        import os.path
        fname_abs = os.path.abspath(fname)
        from hashlib import md5
        cache_filename = "%s/%s.p" % (cache_dir, md5(fname_abs.encode("utf-8")).hexdigest())
        try:
            with open(cache_filename, "rb") as f:
                return pickle.load(f)
        except (IOError, TypeError, KeyError):
            save_cache = True

    if lists is not None:
        s0, s1, y, qids, xtra, types = lists
    else:
        xtra = None
        if '/mc' in fname:
            s0, s1, y, qids, types = loader.load_mctest(fname)
        else:
            s0, s1, y, qids = loader.load_hypev(fname)
            try:
                dsfile = re.sub(r'\.([^.]*)$', '_aux.tsv', fname)  # train.tsv -> train_aux.tsv
                with open(dsfile) as f:
                    rows = csv.DictReader(f, delimiter='\t')
                    xtra = loader.load_hypev_xtra(rows)
                    print(dsfile + ' loaded and available')
            except Exception as e:
                if self.c['aux_r'] or self.c['aux_c']:
                    raise e
            types = None

    if self.vocab is None:
        vocab = Vocabulary(s0 + s1, prune_N=self.c['embprune'], icase=self.c['embicase'])
    else:
        vocab = self.vocab

    # mcqtypes pruning must happen *after* Vocabulary has been constructed!
    if types is not None:
        s0 = [x for x, t in zip(s0, types) if t in self.c['mcqtypes']]
        s1 = [x for x, t in zip(s1, types) if t in self.c['mcqtypes']]
        y = [x for x, t in zip(y, types) if t in self.c['mcqtypes']]
        qids = [x for x, t in zip(qids, types) if t in self.c['mcqtypes']]
        print('Retained %d questions, %d hypotheses (%s types)'
              % (len(set(qids)), len(set([' '.join(s) for s in s0])), self.c['mcqtypes']))

    si0, sj0 = vocab.vectorize(s0, self.emb, spad=self.s0pad)
    si1, sj1 = vocab.vectorize(s1, self.emb, spad=self.s1pad)
    f0, f1 = nlp.sentence_flags(s0, s1, self.s0pad, self.s1pad)
    gr = graph_input_anssel(si0, si1, sj0, sj1, None, None, y, f0, f1, s0, s1)
    if qids is not None:
        gr['qids'] = qids
    if xtra is not None:
        gr['#'] = xtra['#']
        gr['@'] = xtra['@']
    gr, y = self.merge_questions(gr)

    if save_cache:
        with open(cache_filename, "wb") as f:
            pickle.dump((s0, s1, y, vocab, gr), f)
            print("save")

    return (gr, y, vocab)
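# Quick illustration of the *_aux.tsv naming convention derived above; the
# dataset path is a placeholder.
import re
fname = 'data/hypev/argus_train.tsv'
print(re.sub(r'\.([^.]*)$', '_aux.tsv', fname))
# -> data/hypev/argus_train_aux.tsv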
    f0_, f1_ = nlp.sentence_flags(s0, s1, spad, spad)
    return (si0, si1, sj0, sj1, f0_, f1_, labels)


if __name__ == "__main__":
    args = sys.argv[1:]
    if args[0] == '--revocab':
        revocab = True
        args = args[1:]
    else:
        revocab = False
    trainf, valf, testf, dumptrainf, dumpvalf, dumptestf, vocabf = args

    if revocab:
        vocab = Vocabulary(sentence_gen([trainf]), count_thres=2)
        print('%d words' % (len(vocab.word_idx)))
        pickle.dump(vocab, open(vocabf, "wb"))
    else:
        vocab = pickle.load(open(vocabf, "rb"))
        print('%d words' % (len(vocab.word_idx)))

    glove = emb.GloVe(N=300)  # XXX: hardcoded

    print('Preprocessing train file')
    si0, si1, sj0, sj1, f0_, f1_, labels = load_set(trainf, vocab, glove)
    pickle.dump((si0, si1, sj0, sj1, f0_, f1_, labels), open(dumptrainf, "wb"))

    print('Preprocessing validation file')
    si0, si1, sj0, sj1, f0_, f1_, labels = load_set(valf, vocab, glove)
    pickle.dump((si0, si1, sj0, sj1, f0_, f1_, labels), open(dumpvalf, "wb"))
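# Hypothetical invocation of the preprocessing script above (script and file
# names are placeholders); pass --revocab on the first run to build and pickle
# the vocabulary, then drop the flag to reuse the saved one:
#
#   python preprocess.py --revocab train.csv val.csv test.csv \
#       train.pickle val.pickle test.pickle vocab.pickle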
        if i > MAX_SAMPLES:
            break
    return (s0i, s1i, s0j, s1j, f0, f1, labels)


if __name__ == "__main__":
    args = sys.argv[1:]
    if args[0] == '--revocab':
        revocab = True
        args = args[1:]
    else:
        revocab = False
    dataf, dumpf, vocabf = args

    glove = emb.GloVe(N=300)  # XXX: hardcoded

    if revocab:
        vocab = Vocabulary(sentence_gen(dataf), count_thres=2, prune_N=100)
        print('%d words' % (len(vocab.word_idx)))
        pickle.dump(vocab, open(vocabf, "wb"))
    else:
        vocab = pickle.load(open(vocabf, "rb"))
        print('%d words' % (len(vocab.word_idx)))

    s0i, s1i, s0j, s1j, f0, f1, labels = load_set(dataf, vocab, glove)
    pickle.dump((s0i, s1i, s0j, s1j, f0, f1, labels), open(dumpf, "wb"))

    # glove = emb.GloVe(N=300)
def load_vocab(self, vocabf):
    self.texts = loader.load_askubuntu_texts(vocabf)
    self.vocab = Vocabulary(self.texts.values(), prune_N=self.c['embprune'], icase=self.c['embicase'])
    return self.vocab
class AskUTask(ParaphrasingTask):
    def __init__(self):
        self.name = 'asku'
        self.s0pad = 60
        self.s1pad = 60
        self.emb = None
        self.vocab = None

    def config(self, c):
        c['loss'] = ranknet
        c['nb_epoch'] = 16
        c['batch_size'] = 192
        c['epoch_fract'] = 1 / 4

    def load_vocab(self, vocabf):
        self.texts = loader.load_askubuntu_texts(vocabf)
        self.vocab = Vocabulary(self.texts.values(), prune_N=self.c['embprune'], icase=self.c['embicase'])
        return self.vocab

    def load_set(self, fname, cache_dir=None):
        links = loader.load_askubuntu_q(fname)
        return links

    def link_to_s(self, link):
        # convert a link in the askubuntu_q format to a set of sentence pairs
        pid, qids, qlabels = link
        s0 = []
        s1 = []
        labels = []
        for qid, ql in zip(qids, qlabels):
            s0.append(self.texts[pid])
            s1.append(self.texts[qid])
            labels.append(ql)
        return s0, s1, labels

    def links_to_graph(self, links):
        s0 = []
        s1 = []
        labels = []
        for link in links:
            s0l, s1l, labelsl = self.link_to_s(link)
            s0 += s0l
            s1 += s1l
            labels += labelsl

        si0, sj0 = self.vocab.vectorize(s0, self.emb, spad=self.s0pad)
        si1, sj1 = self.vocab.vectorize(s1, self.emb, spad=self.s1pad)
        f0, f1 = nlp.sentence_flags(s0, s1, self.s0pad, self.s1pad)
        gr = graph_input_anssel(si0, si1, sj0, sj1, None, None, np.array(labels), f0, f1, s0, s1)
        return gr

    def load_data(self, trainf, valf, testf=None):
        self.trainf = trainf
        self.valf = valf
        self.testf = testf

        if self.vocab is None:
            # XXX: this vocab includes even val,test words!
            self.load_vocab(os.path.dirname(trainf) + '/text_tokenized.txt.gz')

        self.links = self.load_set(trainf)
        from itertools import chain
        self.gr = {'score': list(chain.from_iterable([l[2] for l in self.links]))}
        print('Training set: %d links, %d sentence pairs' % (len(self.links), len(self.gr['score'])))

        self.linksv = self.load_set(valf)
        self.grv = self.links_to_graph(self.linksv)

        if testf is not None:
            self.linkst = self.load_set(testf)
            self.grt = self.links_to_graph(self.linkst)
        else:
            self.linkst = None
            self.grt = None

    def sample_pairs(self, batch_size, once=False):
        """ A generator that produces random pairs from the dataset """
        ids = list(range(len(self.links)))  # materialize so random.shuffle() can work in-place
        while True:
            random.shuffle(ids)
            links_to_yield = []
            n_yielded = 0
            for i in ids:
                link = self.links[i]
                links_to_yield.append(link)
                n_yielded += len(link[1])

                if n_yielded < batch_size:
                    continue

                # we have accumulated enough pairs, produce a graph
                ogr = self.links_to_graph(links_to_yield)
                links_to_yield = []
                n_yielded = 0
                yield ogr
            if once:
                break

    def fit_callbacks(self, weightsf):
        return [AnsSelCB(self.grv['si0'], self.grv),
                ModelCheckpoint(weightsf, save_best_only=True),
                EarlyStopping(patience=3)]

    def fit_model(self, model, **kwargs):
        batch_size = kwargs.pop('batch_size')
        kwargs['callbacks'] = self.fit_callbacks(kwargs.pop('weightsf'))
        return model.fit_generator(self.sample_pairs(batch_size), **kwargs)

    def eval(self, model):
        res = [None]
        for gr, fname in [(self.grv, self.valf), (self.grt, self.testf)]:
            if gr is None:
                res.append(None)
                continue
            ypred = model.predict(gr)['score'][:, 0]
            res.append(ev.eval_ubuntu(ypred, gr['si0'], gr['score'], fname))
        return tuple(res)

    def res_columns(self, mres, pfx=' '):
        """ Produce README-format markdown table row piece summarizing
        important statistics """
        return ('%s%.6f |%s%.6f |%s%.6f |%s%.6f |%s%.6f |%s%.6f '
                % (pfx, mres[self.valf]['MRR'],
                   pfx, mres[self.valf]['R10_1'],
                   pfx, mres[self.valf]['R10_5'],
                   pfx, mres[self.testf].get('MRR', np.nan),
                   pfx, mres[self.testf].get('R10_1', np.nan),
                   pfx, mres[self.testf].get('R10_5', np.nan)))
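# Sketch of the link format consumed by link_to_s/links_to_graph above: each
# link is (pid, qids, qlabels), i.e. one query post, its candidate posts and
# their 0/1 relevance labels.  The ids and token lists here are invented.
texts = {'q1': ['how', 'to', 'mount', 'a', 'usb', 'drive'],
         'q2': ['mounting', 'usb', 'drives'],
         'q3': ['change', 'desktop', 'wallpaper']}
link = ('q1', ['q2', 'q3'], [1, 0])

pid, qids, qlabels = link
pairs = [(texts[pid], texts[qid], ql) for qid, ql in zip(qids, qlabels)]
# -> two sentence pairs, labelled 1 (related) and 0 (unrelated)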
class AskUTask(ParaphrasingTask):
    def __init__(self):
        self.name = 'asku'
        self.s0pad = 60
        self.s1pad = 60
        self.emb = None
        self.vocab = None

    def config(self, c):
        c['loss'] = ranknet
        c['nb_epoch'] = 16
        c['batch_size'] = 192
        c['epoch_fract'] = 1/4

    def load_vocab(self, vocabf):
        self.texts = loader.load_askubuntu_texts(vocabf)
        self.vocab = Vocabulary(self.texts.values())
        return self.vocab

    def load_set(self, fname, cache_dir=None):
        links = loader.load_askubuntu_q(fname)
        return links

    def link_to_s(self, link):
        # convert a link in the askubuntu_q format to a set of sentence pairs
        pid, qids, qlabels = link
        s0 = []
        s1 = []
        labels = []
        for qid, ql in zip(qids, qlabels):
            s0.append(self.texts[pid])
            s1.append(self.texts[qid])
            labels.append(ql)
        return s0, s1, labels

    def links_to_graph(self, links):
        s0 = []
        s1 = []
        labels = []
        for link in links:
            s0l, s1l, labelsl = self.link_to_s(link)
            s0 += s0l
            s1 += s1l
            labels += labelsl

        si0 = self.vocab.vectorize(s0, spad=self.s0pad)
        si1 = self.vocab.vectorize(s1, spad=self.s1pad)
        f0, f1 = nlp.sentence_flags(s0, s1, self.s0pad, self.s1pad)
        gr = graph_input_anssel(si0, si1, np.array(labels), f0, f1)
        return gr

    def load_data(self, trainf, valf, testf=None):
        self.trainf = trainf
        self.valf = valf
        self.testf = testf

        if self.vocab is None:
            # XXX: this vocab includes even val,test words!
            self.load_vocab(os.path.dirname(trainf) + '/text_tokenized.txt.gz')

        self.links = self.load_set(trainf)
        from itertools import chain
        self.gr = {'score': list(chain.from_iterable([l[2] for l in self.links]))}
        print('Training set: %d links, %d sentence pairs' % (len(self.links), len(self.gr['score'])))

        self.linksv = self.load_set(valf)
        self.grv = self.links_to_graph(self.linksv)

        if testf is not None:
            self.linkst = self.load_set(testf)
            self.grt = self.links_to_graph(self.linkst)
        else:
            self.linkst = None
            self.grt = None

    def sample_pairs(self, batch_size, once=False):
        """ A generator that produces random pairs from the dataset """
        ids = list(range(len(self.links)))  # materialize so random.shuffle() can work in-place
        while True:
            random.shuffle(ids)
            links_to_yield = []
            n_yielded = 0
            for i in ids:
                link = self.links[i]
                links_to_yield.append(link)
                n_yielded += len(link[1])

                if n_yielded < batch_size:
                    continue

                # we have accumulated enough pairs, produce a graph
                ogr = self.links_to_graph(links_to_yield)
                links_to_yield = []
                n_yielded = 0
                yield ogr
            if once:
                break

    def fit_callbacks(self, weightsf):
        return [AnsSelCB(self.grv['si0'], self.grv),
                ModelCheckpoint(weightsf, save_best_only=True),
                EarlyStopping(patience=3)]

    def fit_model(self, model, **kwargs):
        batch_size = kwargs.pop('batch_size')
        kwargs['callbacks'] = self.fit_callbacks(kwargs.pop('weightsf'))
        return model.fit_generator(self.sample_pairs(batch_size), **kwargs)

    def eval(self, model):
        res = [None]
        for gr, fname in [(self.grv, self.valf), (self.grt, self.testf)]:
            if gr is None:
                res.append(None)
                continue
            ypred = model.predict(gr)['score'][:, 0]
            res.append(ev.eval_ubuntu(ypred, gr['si0'], gr['score'], fname))
        return tuple(res)

    def res_columns(self, mres, pfx=' '):
        """ Produce README-format markdown table row piece summarizing
        important statistics """
        return ('%s%.6f |%s%.6f |%s%.6f |%s%.6f |%s%.6f |%s%.6f '
                % (pfx, mres[self.valf]['MRR'],
                   pfx, mres[self.valf]['R10_1'],
                   pfx, mres[self.valf]['R10_5'],
                   pfx, mres[self.testf].get('MRR', np.nan),
                   pfx, mres[self.testf].get('R10_1', np.nan),
                   pfx, mres[self.testf].get('R10_5', np.nan)))
        labels.append(int(label))
        i += 1
        if i > MAX_SAMPLES:
            break
    return (s0i, s1i, f0, f1, labels)


if __name__ == "__main__":
    args = sys.argv[1:]
    if args[0] == '--revocab':
        revocab = True
        args = args[1:]
    else:
        revocab = False
    dataf, dumpf, vocabf = args

    if revocab:
        vocab = Vocabulary(sentence_gen(dataf), count_thres=2)
        print('%d words' % (len(vocab.word_idx)))
        pickle.dump(vocab, open(vocabf, "wb"))
    else:
        vocab = pickle.load(open(vocabf, "rb"))
        print('%d words' % (len(vocab.word_idx)))

    s0i, s1i, f0, f1, labels = load_set(dataf, vocab)
    pickle.dump((s0i, s1i, f0, f1, labels), open(dumpf, "wb"))

    # glove = emb.GloVe(N=300)