Example No. 1
    def getAllBoW(self):
        """Return unigram, bigram words as bag of words."""

        fname = 'cache/word_ngrams.%s.set%d.dom%d.pickle' % (
                 self.getFilename(), self.getEssaySet(), self.getDomain())

        try:
            # Reuse the cached bag of words when it is available.
            with open(fname, 'rb') as f:
                bow = pickle.load(f)
        except (IOError, EOFError, pickle.PickleError):
            bigram_measures = nltk.collocations.BigramAssocMeasures()

            bow = list()
            for line in self.getRawText():
                cur = LanguageUtils.tokenize(line)

                # Score bigrams by pointwise mutual information and append them to the unigram list.
                finder = BigramCollocationFinder.from_words(cur)
                scored = finder.score_ngrams(bigram_measures.pmi)
                for bigram, score in scored:
                    cur.append(bigram)

                bow.append(cur)

            # Cache the result; pickle files must be written in binary mode.
            with open(fname, 'wb') as f:
                pickle.dump(bow, f)

        return bow
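
For reference, here is a minimal standalone sketch of the same unigram-plus-bigram bag of words. It swaps nltk.word_tokenize in for the project's LanguageUtils.tokenize (an assumption; any tokenizer works) and operates on a single hard-coded line:

import nltk
from nltk.collocations import BigramAssocMeasures, BigramCollocationFinder

def line_to_bow(line):
    """Return the line's unigrams plus all of its PMI-scored bigrams as one token list."""
    tokens = nltk.word_tokenize(line)
    finder = BigramCollocationFinder.from_words(tokens)
    scored = finder.score_ngrams(BigramAssocMeasures().pmi)
    return tokens + [bigram for bigram, score in scored]

print(line_to_bow("the cat sat on the mat"))
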
Example No. 2
    def getPOS(self):
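        """Return per-line POS tags, computing and caching them on first use."""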
        if len(self.pos_tags) > 0:
            return self.pos_tags

        fname = 'cache/pos.%s.set%d.pickle' % (self.file_name, self.essay_set)
        try:
            # Reuse the cached tags when they are available.
            with open(fname, 'rb') as f:
                self.pos_tags = pickle.load(f)
        except (IOError, EOFError, pickle.PickleError):
            pos_lines = list()
            tot_ln = self.size()
            prog = 0
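            # HunposTagger shells out to the external hunpos binary and loads the en_wsj.model file from disk.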
            hunpos = nltk.tag.HunposTagger("en_wsj.model")

            for line in self.getRawText():
                tokens = LanguageUtils.punkt_tokenize(line)
                pos_tags = hunpos.tag(tokens)
                tags_only = [tag for w, tag in pos_tags]
                pos_lines.append(tags_only)
                prog += 1
                if prog % 100 == 0:
                    print("POS Tagging %d of %d" % (prog, tot_ln))

            # Store the tags once every line has been tagged, then cache them in binary mode.
            self.pos_tags = pos_lines
            with open(fname, 'wb') as f:
                pickle.dump(self.pos_tags, f)

        return self.pos_tags
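
As a self-contained illustration of the per-line tag structure, the sketch below swaps nltk.pos_tag in for HunposTagger so it runs without the external hunpos binary and model (the tags themselves may differ from hunpos output):

import nltk

lines = ["The cat sat on the mat.", "Dogs bark loudly."]
pos_lines = []
for line in lines:
    tagged = nltk.pos_tag(nltk.word_tokenize(line))   # list of (word, tag) pairs
    pos_lines.append([tag for word, tag in tagged])   # keep the tags only, as getPOS does

print(pos_lines)
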
Example No. 3
    def extractFeatures(self, ds, corpus):
        """Extracts features from a DataSet ds"""

        # Load the essay prompt text from data/essay_set_desc_<set>.txt.
        with open('data/essay_set_desc_%d.txt' % ds.getEssaySet(), 'r') as f:
            prompt = f.read()

        # tokenize into unigrams & bigrams
        tokens = LanguageUtils.tokenize(prompt)

        bigram_measures = nltk.collocations.BigramAssocMeasures()
        finder = BigramCollocationFinder.from_words(tokens)
        scored = finder.score_ngrams(bigram_measures.raw_freq)
        for bigram, score in scored:
            tokens.append(bigram)

        # Dictionary that maps tokens to ids so documents can be projected into LSI space
        dictionary = corpus.getWordDictionary()

        # Trained LSI and TF-IDF models shared across the corpus
        lsi = corpus.getLSA()
        tfidf = corpus.getTfidf()
        mm_corpus = ds.getGensimCorpus()

        # project into lsi space
        vec_bow = dictionary.doc2bow(tokens)
        vec_lsi = lsi[tfidf[vec_bow]]

        # Build a similarity index over all essays in LSI space.
        index = gensim.similarities.MatrixSimilarity(lsi[tfidf[mm_corpus]])

        sims = index[vec_lsi]

        # One feature per essay: its cosine similarity to the prompt in LSI space.
        feats = [[sim] for sim in sims]

        self.features = np.asarray(feats)
        return
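
The method leans on a standard gensim pipeline; the condensed sketch below reproduces it end to end with a toy corpus standing in for the Corpus and DataSet objects (document contents and num_topics are illustrative):

from gensim import corpora, models, similarities

documents = [["the", "cat", "sat"], ["the", "dog", "barked"], ["cats", "and", "dogs"]]

dictionary = corpora.Dictionary(documents)                  # token <-> id mapping
bow_corpus = [dictionary.doc2bow(doc) for doc in documents]

tfidf = models.TfidfModel(bow_corpus)                       # term weighting
lsi = models.LsiModel(tfidf[bow_corpus], id2word=dictionary, num_topics=2)

index = similarities.MatrixSimilarity(lsi[tfidf[bow_corpus]])

query = dictionary.doc2bow(["the", "cat"])                  # plays the role of the prompt
sims = index[lsi[tfidf[query]]]                             # one similarity score per document
print(list(enumerate(sims)))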