Example 1
    def hal(self,
            wordset="verbs",
            zsaxes=(0, 1),
            rectify=False,
            basepath="/auto/k8/huth/storydata/story+books+wiki+15w-densehal-mat",
            debug=False):
        """HAL semantic model (without dimensionality reduction).
        """
        from text.story.util.HalModel import make_hal_wordset_model, verb_set, make_hal_sm, english1000
        ## Load the dense HAL matrix and its vocabulary, then release the file
        haltf = tables.openFile(basepath + ".hf5")
        halmat = np.array(haltf.root.halmat.read())
        haltf.close()
        halvocab = cPickle.load(open(basepath + "-vocab"))

        ## Choose a wordset
        if wordset == "verbs":
            wordset = verb_set
        elif wordset == "cmuverbs":
            wordset = verb_set[:23]
        elif wordset == "english1000":
            wordset = english1000

        halsm = make_hal_sm(halmat, halvocab, wordset)

        for axis in zsaxes:
            halsm.zscore(axis)

        if rectify:
            halsm.rectify()

        halstimseqs = mapdict(self.wordseqs, lambda ds: makelsa(ds, halsm))
        #return mapdict(halstimseqs, lambda s: s.chunksums())
        if debug:
            return halstimseqs
        return self.downsample(halstimseqs)
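The `zsaxes` loop standardizes the HAL matrix along each listed axis, and `rectify` presumably clips negative values to zero. A minimal numpy sketch of those two steps, assuming `SemanticModel.zscore` is a plain z-score (its real implementation is not shown here):

    import numpy as np

    def zscore_axis(mat, axis):
        # Remove the mean and divide by the standard deviation along `axis`,
        # so each column (axis=0) or row (axis=1) gets zero mean, unit variance.
        return (mat - mat.mean(axis=axis, keepdims=True)) / mat.std(axis=axis, keepdims=True)

    mat = np.random.randn(985, 10000)  # stand-in for the dense HAL matrix
    for axis in (0, 1):                # mirrors zsaxes=(0, 1)
        mat = zscore_axis(mat, axis)
    mat[mat < 0] = 0                   # the rectify=True case, presumably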
Example 2
    def word2vec(self,
                 modelfile="/auto/k8/huth/GoogleNews-vectors-negative300.bin",
                 norm=False):
        """Gensim / word2vec model.
        """
        model = self.get_word2vec_model(modelfile, norm)
        #modeldims = model["test"].shape[0]
        #model.data = np.zeros((modeldims,))
        w2vstims = mapdict(self.wordseqs, lambda ds: makelsa(ds, model))
        return self.downsample(w2vstims)
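`get_word2vec_model` is not shown in these examples. A plausible sketch of what it wraps, using gensim's standard loader for the GoogleNews binary format (the norm handling here is an assumption):

    import numpy as np
    from gensim.models import KeyedVectors

    # Load the pretrained 300-dimensional GoogleNews vectors (binary format).
    w2v = KeyedVectors.load_word2vec_format(
        "/auto/k8/huth/GoogleNews-vectors-negative300.bin", binary=True)
    vec = w2v["test"]                     # one 300-dim vector per word
    unit_vec = vec / np.linalg.norm(vec)  # what norm=True would presumably do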
Example 3
    def nmflsa(self):
        """NMF LSA model based on newLSA.
        """
        tf = tables.openFile("/auto/k6/huth/nmf-lsa.hf5")
        vocab = tf.root.vocab.read()
        data = tf.root.data.read()
        tf.close()
        nmodel = SemanticModel(data, vocab)
        wordstimseqs = mapdict(self.wordseqs, lambda ds: makelsa(ds, nmodel))
        #return mapdict(wordstimseqs, lambda s: s.chunksums())
        return self.downsample(wordstimseqs)
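Every method here follows the same pipeline: build a `SemanticModel`, project each stimulus word sequence through it with `mapdict`/`makelsa`, then downsample to the scan times. `mapdict` itself is never defined in these examples; its presumed behavior is a value-wise map over a dict:

    def mapdict(d, fun):
        # Presumed behavior: apply `fun` to every value, keeping the keys.
        return dict((key, fun(value)) for key, value in d.items())

    print(mapdict({"story1": [1, 2], "story2": [3]}, len))
    # {'story1': 2, 'story2': 1}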
Example 4
    def commonwords(
            self,
            num=100,
            basepath="/auto/k8/huth/storydata/stories-wbooks-lsa-2-vocab"):
        """Common word indicator model. Based on old LSA model fitting, used less data.
        """
        vocab = cPickle.load(open(basepath))
        counts = cPickle.load(open(basepath + "-Rcounts"))
        selwords = np.argsort(counts)[-num:]
        wmodel = SemanticModel(np.eye(num), list(np.array(vocab)[selwords]))

        wordstimseqs = mapdict(self.wordseqs, lambda ds: makelsa(ds, wmodel))
        #return mapdict(wordstimseqs, lambda s: s.chunksums())
        return self.downsample(wordstimseqs)
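The indicator construction deserves a note: `np.argsort(counts)[-num:]` keeps the indices of the `num` most frequent words, and `np.eye(num)` gives each selected word a one-hot feature row. A self-contained toy version (vocabulary and counts are invented):

    import numpy as np

    vocab = np.array(["the", "story", "a", "man", "and"])
    counts = np.array([900, 12, 700, 30, 650])

    num = 3
    selwords = np.argsort(counts)[-num:]  # argsort is ascending, so the last 3 are the most frequent
    print(list(vocab[selwords]))          # ['and', 'a', 'the']

    wmodel_data = np.eye(num)             # selected word i -> one-hot row i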
Example 5
    def co(self,
           wordset="english1000",
           zsaxes=(0, 1),
           rectify=False,
           basepath="/auto/k8/huth/storydata/comodels/complete2-15w-denseco-mat",
           debug=False):
        """Co-occurrence-based semantic model (without dimensionality reduction).
        """
        cosm = self.get_co_model(wordset, zsaxes, rectify, basepath)
        costimseqs = mapdict(self.wordseqs, lambda ds: makelsa(ds, cosm))
        #return mapdict(costimseqs, lambda s: s.chunksums())
        if debug:
            return costimseqs
        return self.downsample(costimseqs)
Example 6
    def commonwords2(
            self,
            num=100,
            basepath="/auto/k8/huth/storydata/comodels/complete2-15w-denseco-mat"
    ):
        """Common word indicator model. Base on newer co model fitting, using more data.
        """
        cotf = tables.openFile(basepath + ".hf5")
        counts = cotf.root.wordcounts.read()
        covocab = cPickle.load(open(basepath + "-vocab"))
        selwords = np.argsort(counts)[-num:]
        wmodel = SemanticModel(np.eye(num), list(np.array(covocab)[selwords]))

        wordstimseqs = mapdict(self.wordseqs, lambda ds: makelsa(ds, wmodel))
        return self.downsample(wordstimseqs)
Example 7
    def allwords(self):
        """All word indicator model.
        """
        from text.textcore import Corpus
        corpus_file = "/auto/k5/huth/corpora/story/raw-transcripts/stories1.tar.gz"
        corpus = Corpus(corpus_file, split_documents=200)
        corpus_file1 = "/auto/k5/huth/corpora/story/raw-transcripts/stories2.tar.gz"
        corpus.append_corpus(corpus_file1)

        storyvocab = sorted(set(corpus.get_vocabulary()))
        num = len(storyvocab)
        wmodel = SemanticModel(np.eye(num), storyvocab)

        wordstimseqs = mapdict(self.wordseqs, lambda ds: makelsa(ds, wmodel))
        #return mapdict(wordstimseqs, lambda s: s.chunksums())
        return self.downsample(wordstimseqs)
Example 8
    def newlsa(self,
               ndim,
               rectify,
               entweight,
               entcutoff=5,
               basepath="/auto/k6/huth/lsamats6/",
               debug=False):
        """New LSA semantic model.
        """
        lsasm = self.get_newlsa_model(ndim, rectify, entweight, entcutoff,
                                      basepath)
        lsastimseqs = mapdict(self.wordseqs, lambda ds: makelsa(ds, lsasm))

        if debug:
            return lsastimseqs
        return self.downsample(lsastimseqs)
Example 9
    def lsa(self,
            ndim,
            rectify,
            zsaxes=(1, ),
            basepath="/auto/k8/huth/storydata/stories-wbooks-lsa-2",
            debug=False):
        """LSA semantic model.
        """
        vocab = cPickle.load(open(basepath + "-vocab"))
        lsasm = SemanticModel(None, None)  # empty model, filled from the ascii SVD output
        lsasm.load_ascii_root(basepath + "-Vt", vocab)
        lsasm.data = lsasm.data[:ndim]  # keep only the top ndim dimensions

        for axis in zsaxes:
            lsasm.zscore(axis)

        if rectify:
            lsasm.rectify()

        lsastimseqs = mapdict(self.wordseqs, lambda ds: makelsa(ds, lsasm))
        #return mapdict(lsastimseqs, lambda s: s.chunksums())
        if debug:
            return lsastimseqs
        return self.downsample(lsastimseqs)
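For reference, truncating to `lsasm.data[:ndim]` is the usual LSA reduction: SVD returns the rows of Vt ordered by decreasing singular value, so the first `ndim` rows span the highest-variance semantic dimensions. A toy numpy sketch (the on-disk ascii format read by `load_ascii_root` is not documented here):

    import numpy as np

    X = np.random.randn(200, 1000)  # toy document-by-word count matrix
    U, s, Vt = np.linalg.svd(X, full_matrices=False)

    ndim = 50
    lsa_data = Vt[:ndim]            # mirrors lsasm.data = lsasm.data[:ndim]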