Ejemplo n.º 1
0
    def load_set(self, fname, cache_dir=None):
        # TODO: Make the cache-handling generic,
        # and offer a way to actually pass cache_dir
        save_cache = False
        if cache_dir:
            import os.path
            fname_abs = os.path.abspath(fname)
            from hashlib import md5
            cache_filename = "%s/%s.p" % (
                cache_dir, md5(fname_abs.encode("utf-8")).hexdigest())
            try:
                with open(cache_filename, "rb") as f:
                    return pickle.load(f)
            except (IOError, TypeError, KeyError):
                save_cache = True

        s0, s1, y = loader.load_hypev(fname)

        if self.vocab is None:
            vocab = Vocabulary(s0 + s1)  # FIXME: lower?
        else:
            vocab = self.vocab

        si0 = vocab.vectorize(s0, spad=self.s0pad)
        si1 = vocab.vectorize(s1, spad=self.s1pad)
        f0, f1 = nlp.sentence_flags(s0, s1, self.s0pad, self.s1pad)
        gr = graph_input_anssel(si0, si1, y, f0, f1, s0, s1)
        gr, y = self.merge_questions(gr)
        if save_cache:
            with open(cache_filename, "wb") as f:
                pickle.dump((s0, s1, y, vocab, gr), f)
                print("save")

        return (gr, y, vocab)
Ejemplo n.º 2
0
def load_set(fname, vocab):
    s0, s1, y = loader.load_snli(fname, vocab)
    si0 = vocab.vectorize(s0, spad=spad)
    si1 = vocab.vectorize(s1, spad=spad)
    f0, f1 = nlp.sentence_flags(s0, s1, spad, spad)
    gr = graph_input_anssel(si0, si1, y, f0, f1, s0, s1)
    return (si0, si1, y, gr)
Ejemplo n.º 3
0
def load_set(fname, vocab=None, s0pad=s0pad, s1pad=s1pad, cache_dir=None, skip_oneclass=True):
    """ Caching: If cache_dir is set: it tries to load finished dataset from it 
        (filename of cache is hash of fname), and if that fails, it will compute 
        dataset and try to save it."""
    save_cache = False
    if cache_dir:
        fname_abs = os.path.abspath(fname)
        from hashlib import md5
        cache_filename = "%s/%s.p" % (cache_dir, md5(fname_abs.encode("utf-8")).hexdigest())

        try:
            with open(cache_filename, "rb") as f:
                return pickle.load(f)
        except (IOError, TypeError, KeyError):
            save_cache=True

    s0, s1, y, t = loader.load_anssel(fname, skip_oneclass=skip_oneclass)
    # TODO: Make use of the t-annotations

    if vocab is None:
        vocab = Vocabulary(s0 + s1)

    si0 = vocab.vectorize(s0, spad=s0pad)
    si1 = vocab.vectorize(s1, spad=s1pad)
    f0, f1 = nlp.sentence_flags(s0, s1, s0pad, s1pad)
    gr = graph_input_anssel(si0, si1, y, f0, f1, s0, s1)

    if save_cache:
        with open(cache_filename, "wb") as f:
            pickle.dump((s0, s1, y, vocab, gr), f)
            print("save")

    return (s0, s1, y, vocab, gr)
Ejemplo n.º 4
0
    def load_set(self, fname, cache_dir=None):
        # TODO: Make the cache-handling generic,
        # and offer a way to actually pass cache_dir
        save_cache = False
        if cache_dir:
            import os.path
            fname_abs = os.path.abspath(fname)
            from hashlib import md5
            cache_filename = "%s/%s.p" % (cache_dir, md5(fname_abs.encode("utf-8")).hexdigest())
            try:
                with open(cache_filename, "rb") as f:
                    return pickle.load(f)
            except (IOError, TypeError, KeyError):
                save_cache = True

        skip_oneclass = self.c.get('skip_oneclass', True)
        s0, s1, y, kw, akw, t = loader.load_anssel(fname, skip_oneclass=skip_oneclass)
        # TODO: Make use of the t-annotations

        if self.vocab is None:
            vocab = Vocabulary(s0 + s1)
        else:
            vocab = self.vocab

        si0 = vocab.vectorize(s0, spad=self.s0pad)
        si1 = vocab.vectorize(s1, spad=self.s1pad)
        f0, f1 = nlp.sentence_flags(s0, s1, self.s0pad, self.s1pad)
        gr = graph_input_anssel(si0, si1, y, f0, f1, s0, s1, kw=kw, akw=akw)

        if save_cache:
            with open(cache_filename, "wb") as f:
                pickle.dump((s0, s1, y, vocab, gr), f)
                print("save")

        return (gr, y, vocab)
Ejemplo n.º 5
0
def load_set(fname, vocab=None):
    s0, s1, y, _, _, _ = loader.load_anssel(fname)

    if vocab is None:
        vocab = Vocabulary(s0 + s1)

    si0 = vocab.vectorize(s0)
    si1 = vocab.vectorize(s1)
    f0, f1 = nlp.sentence_flags(s0, s1, s0pad, s1pad)
    gr = graph_input_anssel(si0, si1, y, f0, f1)

    return (s0, s1, y, vocab, gr)
Ejemplo n.º 6
0
def load_set(fname, vocab=None):
    s0, s1, y, t = loader.load_anssel(fname)

    if vocab is None:
        vocab = Vocabulary(s0 + s1)

    si0 = vocab.vectorize(s0)
    si1 = vocab.vectorize(s1)
    f0, f1 = nlp.sentence_flags(s0, s1, s0pad, s1pad)
    gr = graph_input_anssel(si0, si1, y, f0, f1)

    return (s0, s1, y, vocab, gr)
Ejemplo n.º 7
0
def load_set(fname, vocab=None):
    s0, s1, y = loader.load_hypev(fname)
    # s0=questions, s1=answers

    if vocab is None:
        vocab = Vocabulary(s0 + s1)

    si0 = vocab.vectorize(s0, spad=s0pad)
    si1 = vocab.vectorize(s1, spad=s1pad)
    f0, f1 = nlp.sentence_flags(s0, s1, s0pad, s1pad)
    gr = graph_input_anssel(si0, si1, y, f0, f1, s0, s1)

    return s0, s1, y, vocab, gr
Ejemplo n.º 8
0
def load_sent(q, a, vocab=None):
    s0, s1, y = [q], [a], 1
    # s0=questions, s1=answers

    if vocab is None:
        vocab = Vocabulary(s0 + s1)

    si0 = vocab.vectorize(s0, spad=s0pad)
    si1 = vocab.vectorize(s1, spad=s1pad)
    f0, f1 = nlp.sentence_flags(s0, s1, s0pad, s1pad)
    gr = graph_input_anssel(si0, si1, y, f0, f1)

    return gr
Ejemplo n.º 9
0
def load_set(fname, vocab=None):
    s0, s1, y = loader.load_hypev(fname)
    # s0=questions, s1=answers

    if vocab is None:
        vocab = Vocabulary(s0 + s1)

    si0 = vocab.vectorize(s0, spad=s0pad)
    si1 = vocab.vectorize(s1, spad=s1pad)
    f0, f1 = nlp.sentence_flags(s0, s1, s0pad, s1pad)
    gr = graph_input_anssel(si0, si1, y, f0, f1, s0, s1)

    return s0, s1, y, vocab, gr
Ejemplo n.º 10
0
def load_set(fname, vocab=None, s0pad=s0pad, s1pad=s1pad):
    s0, s1, y, t = loader.load_anssel(fname)
    # TODO: Make use of the t-annotations

    if vocab is None:
        vocab = Vocabulary(s0 + s1)

    si0 = vocab.vectorize(s0, spad=s0pad)
    si1 = vocab.vectorize(s1, spad=s1pad)
    f0, f1 = nlp.sentence_flags(s0, s1, s0pad, s1pad)
    gr = graph_input_anssel(si0, si1, y, f0, f1, s0, s1)

    return (s0, s1, y, vocab, gr)
Ejemplo n.º 11
0
    def load_set(self, fname, cache_dir=None):
        # TODO: Make the cache-handling generic,
        # and offer a way to actually pass cache_dir
        save_cache = False
        if cache_dir:
            import os.path
            fname_abs = os.path.abspath(fname)
            from hashlib import md5
            cache_filename = "%s/%s.p" % (
                cache_dir, md5(fname_abs.encode("utf-8")).hexdigest())
            try:
                with open(cache_filename, "rb") as f:
                    return pickle.load(f)
            except (IOError, TypeError, KeyError):
                save_cache = True

        skip_oneclass = self.c.get('skip_oneclass', True)
        s0, s1, y, kw, akw, t = loader.load_anssel(fname,
                                                   skip_oneclass=skip_oneclass)
        # TODO: Make use of the t-annotations

        if self.vocab is None:
            vocab = Vocabulary(s0 + s1,
                               prune_N=self.c['embprune'],
                               icase=self.c['embicase'])
        else:
            vocab = self.vocab

        si0, sj0 = vocab.vectorize(s0, self.emb, spad=self.s0pad)
        si1, sj1 = vocab.vectorize(s1, self.emb, spad=self.s1pad)
        f0, f1 = nlp.sentence_flags(s0, s1, self.s0pad, self.s1pad)
        gr = graph_input_anssel(si0,
                                si1,
                                sj0,
                                sj1,
                                None,
                                None,
                                y,
                                f0,
                                f1,
                                s0,
                                s1,
                                kw=kw,
                                akw=akw)

        if save_cache:
            with open(cache_filename, "wb") as f:
                pickle.dump((s0, s1, y, vocab, gr), f)
                print("save")

        return (gr, y, vocab)
Ejemplo n.º 12
0
    def load_set(self, fname):
        s0, s1, y = loader.load_msrpara(fname)

        if self.vocab is None:
            vocab = Vocabulary(s0 + s1)
        else:
            vocab = self.vocab

        si0 = vocab.vectorize(s0, spad=self.s0pad)
        si1 = vocab.vectorize(s1, spad=self.s1pad)
        f0, f1 = nlp.sentence_flags(s0, s1, self.s0pad, self.s1pad)
        gr = graph_input_anssel(si0, si1, y, f0, f1, s0, s1)

        return (gr, y, vocab)
Ejemplo n.º 13
0
    def load_set(self, fname, cache_dir=None):
        s0, s1, y = loader.load_hypev(fname)

        if self.vocab is None:
            vocab = Vocabulary(s0 + s1)
        else:
            vocab = self.vocab

        si0 = vocab.vectorize(s0, spad=self.s0pad)
        si1 = vocab.vectorize(s1, spad=self.s1pad)
        f0, f1 = nlp.sentence_flags(s0, s1, self.s0pad, self.s1pad)
        gr = graph_input_anssel(si0, si1, y, f0, f1, s0, s1)

        return (gr, y, vocab)
Ejemplo n.º 14
0
    def load_set(self, fname):
        s0, s1, y = loader.load_sick2014(fname, mode='entailment')

        if self.vocab is None:
            vocab = Vocabulary(s0 + s1, prune_N=self.c['embprune'], icase=self.c['embicase'])
        else:
            vocab = self.vocab

        si0, sj0 = vocab.vectorize(s0, self.emb, spad=self.s0pad)
        si1, sj1 = vocab.vectorize(s1, self.emb, spad=self.s1pad)
        f0, f1 = nlp.sentence_flags(s0, s1, self.s0pad, self.s1pad)
        gr = graph_input_anssel(si0, si1, sj0, sj1, None, None, y, f0, f1, s0, s1)

        return (gr, y, vocab)
Ejemplo n.º 15
0
    def links_to_graph(self, links):
        s0 = []
        s1 = []
        labels = []
        for link in links:
            s0l, s1l, labelsl = self.link_to_s(link)
            s0 += s0l
            s1 += s1l
            labels += labelsl

        si0 = self.vocab.vectorize(s0, spad=self.s0pad)
        si1 = self.vocab.vectorize(s1, spad=self.s1pad)
        f0, f1 = nlp.sentence_flags(s0, s1, self.s0pad, self.s1pad)

        gr = graph_input_anssel(si0, si1, np.array(labels), f0, f1)
        return gr
Ejemplo n.º 16
0
    def links_to_graph(self, links):
        s0 = []
        s1 = []
        labels = []
        for link in links:
            s0l, s1l, labelsl = self.link_to_s(link)
            s0 += s0l
            s1 += s1l
            labels += labelsl

        si0 = self.vocab.vectorize(s0, spad=self.s0pad)
        si1 = self.vocab.vectorize(s1, spad=self.s1pad)
        f0, f1 = nlp.sentence_flags(s0, s1, self.s0pad, self.s1pad)

        gr = graph_input_anssel(si0, si1, np.array(labels), f0, f1)
        return gr
Ejemplo n.º 17
0
    def load_set(self, fname):
        s0, s1, y = loader.load_sick2014(fname, mode='entailment')

        if self.vocab is None:
            vocab = Vocabulary(s0 + s1,
                               prune_N=self.c['embprune'],
                               icase=self.c['embicase'])
        else:
            vocab = self.vocab

        si0, sj0 = vocab.vectorize(s0, self.emb, spad=self.s0pad)
        si1, sj1 = vocab.vectorize(s1, self.emb, spad=self.s1pad)
        f0, f1 = nlp.sentence_flags(s0, s1, self.s0pad, self.s1pad)
        gr = graph_input_anssel(si0, si1, sj0, sj1, None, None, y, f0, f1, s0,
                                s1)

        return (gr, y, vocab)
Ejemplo n.º 18
0
def load_set(fname, emb, vocab=None):
    s0, s1, y, _, _, _ = loader.load_anssel(fname)

    if vocab is None:
        vocab = Vocabulary(s0 + s1)

    si0, sj0 = vocab.vectorize(s0, emb)
    si1, sj1 = vocab.vectorize(s1, emb)
    se0 = emb.map_jset(sj0)
    se1 = emb.map_jset(sj1)
    f0, f1 = nlp.sentence_flags(s0, s1, s0pad, s1pad)
    gr = graph_input_anssel(si0, si1, sj0, sj1, se0, se1, y, f0, f1)

    # XXX: Pre-generating the whole (se0, se1) produces a *big* memory footprint
    # for the dataset.  In KeraSTS, we solve this by using fit_generator (also
    # because of epoch_fract) and embed just per-batch.

    return (s0, s1, y, vocab, gr)
Ejemplo n.º 19
0
def load_set(fname, emb, vocab=None):
    s0, s1, y, _, _, _ = loader.load_anssel(fname)

    if vocab is None:
        vocab = Vocabulary(s0 + s1)

    si0, sj0 = vocab.vectorize(s0, emb)
    si1, sj1 = vocab.vectorize(s1, emb)
    se0 = emb.map_jset(sj0)
    se1 = emb.map_jset(sj1)
    f0, f1 = nlp.sentence_flags(s0, s1, s0pad, s1pad)
    gr = graph_input_anssel(si0, si1, sj0, sj1, se0, se1, y, f0, f1)

    # XXX: Pre-generating the whole (se0, se1) produces a *big* memory footprint
    # for the dataset.  In KeraSTS, we solve this by using fit_generator (also
    # because of epoch_fract) and embed just per-batch.

    return (s0, s1, y, vocab, gr)
Ejemplo n.º 20
0
    def load_set(self, fname, lists=None):
        if lists:
            s0, s1, y = lists
        else:
            #            s0, s1, y = loader.load_msrpara(fname)   #set it free is we decide not to use quora dataset
            s0, s1, y = loader.load_quora(fname)

        if self.vocab is None:
            vocab = Vocabulary(s0 + s1,
                               prune_N=self.c['embprune'],
                               icase=self.c['embicase'])
        else:
            vocab = self.vocab

        si0, sj0 = vocab.vectorize(s0, self.emb, spad=self.s0pad)
        si1, sj1 = vocab.vectorize(s1, self.emb, spad=self.s1pad)
        f0, f1 = nlp.sentence_flags(s0, s1, self.s0pad, self.s1pad)
        gr = graph_nparray_anssel(
            graph_input_anssel(si0, si1, sj0, sj1, None, None, y, f0, f1))

        return (gr, y, vocab)
Ejemplo n.º 21
0
 def load_set(self, fname, cache_dir=None):
     si0, si1, sj0, sj1, f0, f1, labels = cPickle.load(open(fname, "rb"))
     gr = graph_input_anssel(si0, si1, sj0, sj1, None, None, labels, f0, f1)
     return (gr, labels, self.vocab)
Ejemplo n.º 22
0
 def load_set(self, fname):
     si0, si1, sj0, sj1, f0_, f1_, labels = pickle.load(open(fname, "rb"))
     gr = graph_input_anssel(si0, si1, sj0, sj1, None, None, labels, f0_, f1_)
     return (gr, labels, self.vocab)
Ejemplo n.º 23
0
 def load_set(self, fname):
     si0, si1, sj0, sj1, f0_, f1_, labels = pickle.load(open(fname, "rb"))
     gr = graph_input_anssel(si0, si1, sj0, sj1, None, None, labels, f0_,
                             f1_)
     return (gr, labels, self.vocab)
Ejemplo n.º 24
0
    def load_set(self, fname, cache_dir=None, lists=None):
        # TODO: Make the cache-handling generic,
        # and offer a way to actually pass cache_dir
        save_cache = False
        if cache_dir:
            import os.path
            fname_abs = os.path.abspath(fname)
            from hashlib import md5
            cache_filename = "%s/%s.p" % (cache_dir, md5(fname_abs.encode("utf-8")).hexdigest())
            try:
                with open(cache_filename, "rb") as f:
                    return pickle.load(f)
            except (IOError, TypeError, KeyError):
                save_cache = True

        if lists is not None:
            s0, s1, y, qids, xtra, types = lists
        else:
            xtra = None
            if '/mc' in fname:
                s0, s1, y, qids, types = loader.load_mctest(fname)
            else:
                s0, s1, y, qids = loader.load_hypev(fname)
                try:
                    dsfile = re.sub('\.([^.]*)$', '_aux.tsv', fname)  # train.tsv -> train_aux.tsv
                    with open(dsfile) as f:
                        rows = csv.DictReader(f, delimiter='\t')
                        xtra = loader.load_hypev_xtra(rows)
                        print(dsfile + ' loaded and available')
                except Exception as e:
                    if self.c['aux_r'] or self.c['aux_c']:
                        raise e
                types = None

        if self.vocab is None:
            vocab = Vocabulary(s0 + s1, prune_N=self.c['embprune'], icase=self.c['embicase'])
        else:
            vocab = self.vocab

        # mcqtypes pruning must happen *after* Vocabulary has been constructed!
        if types is not None:
            s0 = [x for x, t in zip(s0, types) if t in self.c['mcqtypes']]
            s1 = [x for x, t in zip(s1, types) if t in self.c['mcqtypes']]
            y = [x for x, t in zip(y, types) if t in self.c['mcqtypes']]
            qids = [x for x, t in zip(qids, types) if t in self.c['mcqtypes']]
            print('Retained %d questions, %d hypotheses (%s types)' % (len(set(qids)), len(set([' '.join(s) for s in s0])), self.c['mcqtypes']))

        si0, sj0 = vocab.vectorize(s0, self.emb, spad=self.s0pad)
        si1, sj1 = vocab.vectorize(s1, self.emb, spad=self.s1pad)
        f0, f1 = nlp.sentence_flags(s0, s1, self.s0pad, self.s1pad)
        gr = graph_input_anssel(si0, si1, sj0, sj1, None, None, y, f0, f1, s0, s1)
        if qids is not None:
            gr['qids'] = qids
        if xtra is not None:
            gr['#'] = xtra['#']
            gr['@'] = xtra['@']
        gr, y = self.merge_questions(gr)
        if save_cache:
            with open(cache_filename, "wb") as f:
                pickle.dump((s0, s1, y, vocab, gr), f)
                print("save")

        return (gr, y, vocab)
Ejemplo n.º 25
0
 def load_set(self, fname):
     si0, si1, f0, f1, labels = pickle.load(open(fname, "rb"))
     gr = graph_input_anssel(si0, si1, labels, f0, f1)
     return (gr, labels, self.vocab)
Ejemplo n.º 26
0
 def load_set(self, fname):
     si0, si1, f0, f1, labels = pickle.load(open(fname, "rb"))
     gr = graph_input_anssel(si0, si1, labels, f0, f1)
     return (gr, labels, self.vocab)
Ejemplo n.º 27
0
def load_set(fname, vocab):
    si0, si1, f0, f1, labels = cPickle.load(open(fname, "rb"))
    gr = graph_input_anssel(si0, si1, labels, f0, f1)
    return gr
Ejemplo n.º 28
0
    def load_set(self, fname, cache_dir=None, lists=None):
        # TODO: Make the cache-handling generic,
        # and offer a way to actually pass cache_dir
        save_cache = False
        if cache_dir:
            import os.path
            fname_abs = os.path.abspath(fname)
            from hashlib import md5
            cache_filename = "%s/%s.p" % (
                cache_dir, md5(fname_abs.encode("utf-8")).hexdigest())
            try:
                with open(cache_filename, "rb") as f:
                    return pickle.load(f)
            except (IOError, TypeError, KeyError):
                save_cache = True

        if lists is not None:
            s0, s1, y, qids, xtra, types = lists
        else:
            xtra = None
            if '/mc' in fname:
                s0, s1, y, qids, types = loader.load_mctest(fname)
            else:
                s0, s1, y, qids = loader.load_hypev(fname)
                try:
                    dsfile = re.sub('\.([^.]*)$', '_aux.tsv',
                                    fname)  # train.tsv -> train_aux.tsv
                    with open(dsfile) as f:
                        rows = csv.DictReader(f, delimiter='\t')
                        xtra = loader.load_hypev_xtra(rows)
                        print(dsfile + ' loaded and available')
                except Exception as e:
                    if self.c['aux_r'] or self.c['aux_c']:
                        raise e
                types = None

        if self.vocab is None:
            vocab = Vocabulary(s0 + s1,
                               prune_N=self.c['embprune'],
                               icase=self.c['embicase'])
        else:
            vocab = self.vocab

        # mcqtypes pruning must happen *after* Vocabulary has been constructed!
        if types is not None:
            s0 = [x for x, t in zip(s0, types) if t in self.c['mcqtypes']]
            s1 = [x for x, t in zip(s1, types) if t in self.c['mcqtypes']]
            y = [x for x, t in zip(y, types) if t in self.c['mcqtypes']]
            qids = [x for x, t in zip(qids, types) if t in self.c['mcqtypes']]
            print(
                'Retained %d questions, %d hypotheses (%s types)' %
                (len(set(qids)), len(set([' '.join(s)
                                          for s in s0])), self.c['mcqtypes']))

        si0, sj0 = vocab.vectorize(s0, self.emb, spad=self.s0pad)
        si1, sj1 = vocab.vectorize(s1, self.emb, spad=self.s1pad)
        f0, f1 = nlp.sentence_flags(s0, s1, self.s0pad, self.s1pad)
        gr = graph_input_anssel(si0, si1, sj0, sj1, None, None, y, f0, f1, s0,
                                s1)
        if qids is not None:
            gr['qids'] = qids
        if xtra is not None:
            gr['#'] = xtra['#']
            gr['@'] = xtra['@']
        gr, y = self.merge_questions(gr)
        if save_cache:
            with open(cache_filename, "wb") as f:
                pickle.dump((s0, s1, y, vocab, gr), f)
                print("save")

        return (gr, y, vocab)