Ejemplo n.º 1
0
    def getAllBoW(self):
        """Return unigram, bigram words as bag of words.

        The result is a list with one entry per line of ``self.getRawText()``.
        Each entry is the token list produced by ``LanguageUtils.tokenize``
        for that line, followed by the bigrams that NLTK's collocation finder
        scored (by PMI) over those same tokens.

        The result is cached as a pickle under ``cache/``, keyed by filename,
        essay set and domain.  If the cache file cannot be opened or
        unpickled, the bag of words is rebuilt and the cache rewritten.  The
        write is not guarded, so a missing ``cache/`` directory still raises.
        """

        fname = 'cache/word_ngrams.%s.set%d.dom%d.pickle' % (
                 self.getFilename(), self.getEssaySet(), self.getDomain())

        try:
            # NOTE: pickle is only safe on files this program wrote itself;
            # never point this cache at untrusted data.
            with open(fname, 'rb') as cache_file:
                return pickle.load(cache_file)
        except Exception:
            # Best-effort cache: a missing, truncated or stale pickle can
            # raise many different exception types, and all of them simply
            # mean "rebuild".  Unlike a bare ``except`` this lets
            # KeyboardInterrupt / SystemExit propagate.
            pass

        bigram_measures = nltk.collocations.BigramAssocMeasures()

        bow = []
        for line in self.getRawText():
            cur = LanguageUtils.tokenize(line)

            # The finder is built from the unigram tokens before any bigram
            # is appended, so the bigrams never feed back into the counts.
            finder = BigramCollocationFinder.from_words(cur)
            scored = finder.score_ngrams(bigram_measures.pmi)
            cur.extend(bigram for bigram, _score in scored)

            bow.append(cur)

        # Pickle streams are bytes: the file must be opened in binary mode.
        with open(fname, 'wb') as cache_file:
            pickle.dump(bow, cache_file)

        return bow
Ejemplo n.º 2
0
    def extractFeatures(self, ds, corpus):
        """Extracts features from a DataSet ds.

        Reads the prompt description for ``ds``'s essay set from
        ``data/essay_set_desc_<set>.txt``, tokenizes it into unigrams plus
        bigrams, and projects it through ``corpus``'s TF-IDF and LSI models.
        That vector is then compared against every document of
        ``ds.getGensimCorpus()`` projected the same way.

        The similarities are stored in ``self.features`` as a 2-D array with
        one row per similarity value and a single column.  Nothing is
        returned.
        """

        # load into memory the string from data/essay_set_desc_?.txt
        with open('data/essay_set_desc_%d.txt' % ds.getEssaySet(), 'r') as f:
            prompt = f.read()

        # tokenize into unigrams & bigrams
        tokens = LanguageUtils.tokenize(prompt)

        bigram_measures = nltk.collocations.BigramAssocMeasures()
        finder = BigramCollocationFinder.from_words(tokens)
        scored = finder.score_ngrams(bigram_measures.raw_freq)
        tokens.extend(bigram for bigram, _score in scored)

        # Get feature bows for projection into LSI
        dictionary = corpus.getWordDictionary()

        # get lsi
        lsi = corpus.getLSA()
        tfidf = corpus.getTfidf()
        essay_corpus = ds.getGensimCorpus()

        # project the prompt into lsi space
        vec_bow = dictionary.doc2bow(tokens)
        vec_lsi = lsi[tfidf[vec_bow]]

        # index the essays in the same space and query with the prompt
        index = gensim.similarities.MatrixSimilarity(lsi[tfidf[essay_corpus]])
        sims = index[vec_lsi]

        # one single-element row per similarity -> column of features
        self.features = np.asarray([[sim] for sim in sims])
        return