def load_set(fname, emb, cache_dir=None):
    save_cache = False
    if cache_dir:
        fname_abs = os.path.abspath(fname)
        from hashlib import md5
        cache_filename = "%s/%s.p" % (cache_dir, md5(fname_abs.encode("utf-8")).hexdigest())
        try:
            with open(cache_filename, "rb") as f:
                return pickle.load(f)
        except (IOError, TypeError, KeyError):
            save_cache = True

    s0, s1, y, _, _, _ = loader.load_anssel(fname)
    e0, e1, s0, s1, y = loader.load_embedded(emb, s0, s1, y, balance=True, ndim=1)

    if save_cache:
        with open(cache_filename, "wb") as f:
            pickle.dump((e0, e1, y), f)

    return (e0, e1, y)
def load_set(self, fname, cache_dir=None): # TODO: Make the cache-handling generic, # and offer a way to actually pass cache_dir save_cache = False if cache_dir: import os.path fname_abs = os.path.abspath(fname) from hashlib import md5 cache_filename = "%s/%s.p" % (cache_dir, md5(fname_abs.encode("utf-8")).hexdigest()) try: with open(cache_filename, "rb") as f: return pickle.load(f) except (IOError, TypeError, KeyError): save_cache = True skip_oneclass = self.c.get('skip_oneclass', True) s0, s1, y, kw, akw, t = loader.load_anssel(fname, skip_oneclass=skip_oneclass) # TODO: Make use of the t-annotations if self.vocab is None: vocab = Vocabulary(s0 + s1) else: vocab = self.vocab si0 = vocab.vectorize(s0, spad=self.s0pad) si1 = vocab.vectorize(s1, spad=self.s1pad) f0, f1 = nlp.sentence_flags(s0, s1, self.s0pad, self.s1pad) gr = graph_input_anssel(si0, si1, y, f0, f1, s0, s1, kw=kw, akw=akw) if save_cache: with open(cache_filename, "wb") as f: pickle.dump((s0, s1, y, vocab, gr), f) print("save") return (gr, y, vocab)
def load_set(fname, vocab=None, s0pad=s0pad, s1pad=s1pad, cache_dir=None, skip_oneclass=True):
    """ Caching: if cache_dir is set, try to load the finished dataset from it
    (the cache filename is a hash of fname); if that fails, compute the dataset
    and try to save it. """
    save_cache = False
    if cache_dir:
        fname_abs = os.path.abspath(fname)
        from hashlib import md5
        cache_filename = "%s/%s.p" % (cache_dir, md5(fname_abs.encode("utf-8")).hexdigest())
        try:
            with open(cache_filename, "rb") as f:
                return pickle.load(f)
        except (IOError, TypeError, KeyError):
            save_cache = True

    s0, s1, y, t = loader.load_anssel(fname, skip_oneclass=skip_oneclass)
    # TODO: Make use of the t-annotations

    if vocab is None:
        vocab = Vocabulary(s0 + s1)

    si0 = vocab.vectorize(s0, spad=s0pad)
    si1 = vocab.vectorize(s1, spad=s1pad)
    f0, f1 = nlp.sentence_flags(s0, s1, s0pad, s1pad)
    gr = graph_input_anssel(si0, si1, y, f0, f1, s0, s1)

    if save_cache:
        with open(cache_filename, "wb") as f:
            pickle.dump((s0, s1, y, vocab, gr), f)
            print("save")

    return (s0, s1, y, vocab, gr)
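# Illustrative usage sketch (not part of the original code; the file paths are
# hypothetical).  It assumes the load_set variant directly above: the first call
# computes the dataset and pickles it into cache_dir, and a later run with the
# same fname loads the pickle instead of recomputing; the dev set reuses the
# training vocabulary.
def example_load_train_dev():
    tr_s0, tr_s1, tr_y, vocab, tr_gr = load_set('train.csv', cache_dir='.cache')
    de_s0, de_s1, de_y, _, de_gr = load_set('dev.csv', vocab=vocab, cache_dir='.cache')
    return tr_gr, de_gr, vocab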
def load_set(glove, fname, balance=False, subsample0=3):
    s0, s1, labels, toklabels = loader.load_anssel(fname, subsample0=subsample0)
    print('(%s) Loaded dataset: %d' % (fname, len(s0)))
    e0, e1, s0, s1, labels = loader.load_embedded(glove, s0, s1, labels, balance=balance)
    return ([e0, e1], labels)
def load_set(fname, vocab=None):
    s0, s1, y, _, _, _ = loader.load_anssel(fname)
    if vocab is None:
        vocab = Vocabulary(s0 + s1)
    si0 = vocab.vectorize(s0)
    si1 = vocab.vectorize(s1)
    f0, f1 = nlp.sentence_flags(s0, s1, s0pad, s1pad)
    gr = graph_input_anssel(si0, si1, y, f0, f1)
    return (s0, s1, y, vocab, gr)
def load_set(fname, vocab=None):
    s0, s1, y, t = loader.load_anssel(fname)
    if vocab is None:
        vocab = Vocabulary(s0 + s1)
    si0 = vocab.vectorize(s0)
    si1 = vocab.vectorize(s1)
    f0, f1 = nlp.sentence_flags(s0, s1, s0pad, s1pad)
    gr = graph_input_anssel(si0, si1, y, f0, f1)
    return (s0, s1, y, vocab, gr)
def load_set(fname, vocab=None):
    s0, s1, y, t = loader.load_anssel(fname, skip_oneclass=False)
    # s0=questions, s1=answers
    if vocab is None:
        vocab = Vocabulary(s0 + s1)
    si0 = vocab.vectorize(s0, spad=s0pad)
    si1 = vocab.vectorize(s1, spad=s1pad)
    f0, f1 = nlp.sentence_flags(s0, s1, s0pad, s1pad)
    gr = graph_input_anssel(si0, si1, y, f0, f1)
    return s0, s1, y, vocab, gr
def load_set(fname, vocab=None, s0pad=s0pad, s1pad=s1pad):
    s0, s1, y, t = loader.load_anssel(fname)
    # TODO: Make use of the t-annotations
    if vocab is None:
        vocab = Vocabulary(s0 + s1)
    si0 = vocab.vectorize(s0, spad=s0pad)
    si1 = vocab.vectorize(s1, spad=s1pad)
    f0, f1 = nlp.sentence_flags(s0, s1, s0pad, s1pad)
    gr = graph_input_anssel(si0, si1, y, f0, f1, s0, s1)
    return (s0, s1, y, vocab, gr)
def load_set(self, fname, cache_dir=None): # TODO: Make the cache-handling generic, # and offer a way to actually pass cache_dir save_cache = False if cache_dir: import os.path fname_abs = os.path.abspath(fname) from hashlib import md5 cache_filename = "%s/%s.p" % ( cache_dir, md5(fname_abs.encode("utf-8")).hexdigest()) try: with open(cache_filename, "rb") as f: return pickle.load(f) except (IOError, TypeError, KeyError): save_cache = True skip_oneclass = self.c.get('skip_oneclass', True) s0, s1, y, kw, akw, t = loader.load_anssel(fname, skip_oneclass=skip_oneclass) # TODO: Make use of the t-annotations if self.vocab is None: vocab = Vocabulary(s0 + s1, prune_N=self.c['embprune'], icase=self.c['embicase']) else: vocab = self.vocab si0, sj0 = vocab.vectorize(s0, self.emb, spad=self.s0pad) si1, sj1 = vocab.vectorize(s1, self.emb, spad=self.s1pad) f0, f1 = nlp.sentence_flags(s0, s1, self.s0pad, self.s1pad) gr = graph_input_anssel(si0, si1, sj0, sj1, None, None, y, f0, f1, s0, s1, kw=kw, akw=akw) if save_cache: with open(cache_filename, "wb") as f: pickle.dump((s0, s1, y, vocab, gr), f) print("save") return (gr, y, vocab)
def load_set(fname, emb, vocab=None):
    s0, s1, y, _, _, _ = loader.load_anssel(fname)
    if vocab is None:
        vocab = Vocabulary(s0 + s1)
    si0, sj0 = vocab.vectorize(s0, emb)
    si1, sj1 = vocab.vectorize(s1, emb)
    se0 = emb.map_jset(sj0)
    se1 = emb.map_jset(sj1)
    f0, f1 = nlp.sentence_flags(s0, s1, s0pad, s1pad)
    gr = graph_input_anssel(si0, si1, sj0, sj1, se0, se1, y, f0, f1)
    # XXX: Pre-generating the whole (se0, se1) produces a *big* memory footprint
    # for the dataset.  In KeraSTS, we solve this by using fit_generator (also
    # because of epoch_fract) and embed just per-batch.
    return (s0, s1, y, vocab, gr)
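# Illustrative sketch (not part of the original code) of the per-batch embedding
# approach mentioned in the XXX comment above: instead of materializing se0/se1
# for the whole dataset, an endless generator embeds only the current batch, as
# e.g. Keras' fit_generator expects.  It assumes sj0, sj1, y are numpy arrays
# and that emb.map_jset accepts slices of them.
def embed_batches(sj0, sj1, y, emb, batch_size=160):
    import numpy as np
    n = len(y)
    while True:
        idx = np.random.permutation(n)  # reshuffle samples every epoch
        for start in range(0, n, batch_size):
            b = idx[start:start + batch_size]
            yield ([emb.map_jset(sj0[b]), emb.map_jset(sj1[b])], y[b])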
def load_set(fname, emb, cache_dir=None):
    save_cache = False
    if cache_dir:
        fname_abs = os.path.abspath(fname)
        from hashlib import md5
        cache_filename = "%s/%s.p" % (cache_dir, md5(fname_abs.encode("utf-8")).hexdigest())
        try:
            with open(cache_filename, "rb") as f:
                return pickle.load(f)
        except (IOError, TypeError, KeyError):
            save_cache = True

    s0, s1, y, t = loader.load_anssel(fname)
    e0, e1, s0, s1, y = loader.load_embedded(emb, s0, s1, y, balance=True, ndim=1)

    if save_cache:
        with open(cache_filename, "wb") as f:
            pickle.dump((e0, e1, y), f)

    return (e0, e1, y)
def load_set(glove, fname, balance=False, subsample0=3):
    s0, s1, labels = loader.load_anssel(fname, subsample0=subsample0)
    print('(%s) Loaded dataset: %d' % (fname, len(s0)))
    e0, e1, s0, s1, labels = loader.load_embedded(glove, s0, s1, labels, balance=balance)
    return ([e0, e1], labels)