コード例 #1
0
        lenfeats = list()
        for line in ds.getRawText():
            curfeat = list()
            words = FeatureTransitions.word_splitter.tokenize(line) 
            words = [word.lower() for word in words]
            words = [word for word in words if word.isalpha()]
            word_set = set(words)
            word_set.discard('')
            unique_transitions_count = 0
            transitions_count = 0 
            
            for word in words:
                if word in FeatureTransitions.transitions:
                    transitions_count += 1
            
            for word in word_set:
                if word in FeatureTransitions.transitions:
                    unique_transitions_count += 1
            
            #curfeat.append(transitions_count)
            curfeat.append(unique_transitions_count)
            #curfeat.append(unique_transitions_count/len(words))
            
            lenfeats.append(curfeat)

        self.features = np.asarray(lenfeats)
        return

# Register FeatureTransitions with FeatureBase (presumably abc.ABC.register — confirm against FeatureBase's declaration).
FeatureBase.register(FeatureTransitions)

コード例 #2
0
            misspelled_char_count_in_set = 0
            misspelled_short_word_count = 0

            for word in words:
                if not word in FeatureSpelling.dictionary:
                    misspelled_word_count += 1
                    if len(word) < 6:
                        misspelled_short_word_count += 1
                    misspelled_char_count += len(word)

            for word in word_set:
                if not word in FeatureSpelling.dictionary:
                    misspelled_words.add(word)
                    misspelled_char_count_in_set += len(word)
            
            curfeat.append(len(misspelled_words))
            curfeat.append(misspelled_short_word_count)
            #curfeat.append(len(misspelled_words)/len(word_set)) 
            #curfeat.append(misspelled_word_count)
            curfeat.append(misspelled_char_count / (misspelled_word_count+1))
            #curfeat.append(misspelled_word_count / len(words))
            #curfeat.append(misspelled_char_count_in_set / (len(word_set) + 1))
            
            lenfeats.append(curfeat)

        self.features = np.asarray(lenfeats)
        return

# Register FeatureSpelling with FeatureBase (presumably abc.ABC.register — confirm against FeatureBase's declaration).
FeatureBase.register(FeatureSpelling)

コード例 #3
0
            # Keep a counter of how many added.
            # Only add if index < length(ds_train)
            sorted_inds = [i[0] for i in sorted(enumerate(sims), key=lambda x:x[1], reverse=True)]

            counts = dict()
            for grade in range(min(tgrades), max(tgrades)+1):
                counts[grade] = 0
                
            for i in range(params.NUM_NN):
                if i == 0 and ds.isTrainSet():
                    continue
                ind = sorted_inds[i]
                grade = tgrades[ind]
                counts[grade] += 1

            if ds.isTrainSet():
                ind = sorted_inds[params.NUM_NN]
                grade = tgrades[ind]
                counts[grade] += 1

            for key in counts:
                cur_feat.append(counts[key])

            feats.append(cur_feat)

        self.features = np.asarray(feats)
        return

# Register FeatureNN with FeatureBase (presumably abc.ABC.register — confirm against FeatureBase's declaration).
FeatureBase.register(FeatureNN)

コード例 #4
0
        unigram_scored_fname = 'cache/pos_unigram_corpus_scored.%s.set%d.pickle' % (ds.getID(), ds.getEssaySet())
        try:
            f = open(unigram_scored_fname, 'rb')
            print "Found Pickled <POS Unigram Corpus Scores>. Loading..."
            scored = pickle.load(f)
        except:
            freqs = nltk.FreqDist(x for x in corpus.getPOS()) # should we preserve case?
            scored = freqs.items()
            scored = [(word, score) for (word, score) in scored if not word in nltk.corpus.stopwords.words('english')]
            pickle.dump(scored, open(unigram_scored_fname, 'wb'))

        scored = scored[0:params.TOTAL_POS_UNIGRAMS]
        #print "Unigram features: "
        #print scored

        feats = list()
        for line in ds.getPOS():
            cur_feats = list()
            for pos, score in scored:
                cur_feats.append(line.count(pos))

            feats.append(cur_feats)

        #print feats

        self.features = np.asarray(feats)
        return

# Register FeaturePOSUnigram with FeatureBase (presumably abc.ABC.register — confirm against FeatureBase's declaration).
FeatureBase.register(FeaturePOSUnigram)

コード例 #5
0
        for bigram, score in scored:
            tokens.append(bigram)

        # Get feature bows for projection into LSI
        dictionary = corpus.getWordDictionary()

        # get lsi
        lsi = corpus.getLSA()
        tfidf = corpus.getTfidf()
        mm_corpus = ds.getGensimCorpus()

        # project into lsi space
        vec_bow = dictionary.doc2bow(tokens)
        vec_lsi = lsi[tfidf[vec_bow]]

        index = gensim.similarities.MatrixSimilarity(lsi[tfidf[ds.getGensimCorpus()]])

        sims = index[vec_lsi]

        feats = list()
        for sim in sims:
            cur_feat = list()
            cur_feat.append(sim)
            feats.append(cur_feat)

        self.features = np.asarray(feats)
        return

# Register FeaturePrompt with FeatureBase (presumably abc.ABC.register — confirm against FeatureBase's declaration).
FeatureBase.register(FeaturePrompt)

コード例 #6
0
    def extractFeatures(self, ds, corpus):
        """Extracts features from a DataSet ds.

        For every document in ds, projects its tfidf bag-of-words into LSI
        space and records the mean cosine similarity against all documents
        in the set as a single feature. Stores the result in self.features
        as an (n_docs, 1) numpy array; returns None.
        """

        feats = list()
        lsi = corpus.getLSA()
        tfidf = corpus.getTfidf()

        # Build the similarity index over the whole data set once, outside
        # the per-document loop.
        index = gensim.similarities.MatrixSimilarity(lsi[tfidf[ds.getGensimCorpus()]])
        for mm in ds.getGensimCorpus():
            cur_feat = list()
            # Project this document's bag-of-words into LSI topic space.
            vec_lsi = lsi[tfidf[mm]]

            # AGGREGATE SIMILARITY: mean similarity to all documents.
            # (Original computed sims twice in a row; once suffices.)
            sims = np.asarray(index[vec_lsi])
            cur_feat.append(np.mean(sims))
            #cur_feat.append(np.var(sims))
            #cur_feat.append(np.median(sims))

            feats.append(cur_feat)

        self.features = np.asarray(feats)
        return

# Register FeatureSim with FeatureBase (presumably abc.ABC.register — confirm against FeatureBase's declaration).
FeatureBase.register(FeatureSim)

コード例 #7
0
    def extractFeatures(self, ds, corpus):
        """Extracts features from a DataSet ds"""

        feats = list()
        lsi = corpus.getLSA()
        tfidf = corpus.getTfidf()
        text = ds.getRawText()
        i=0
        for mm in ds.getGensimCorpus():
            cur_feat = list()
            if len(mm) == 0:
                if params.DEBUG:
                    print "WARNING: No LSI features for current feature. Using (0,1) - which may be a horrible assumption."
                mm = [(0,1)]

            for topic, score in lsi[tfidf[mm]]:
                cur_feat.append(score)

            if len(cur_feat) != params.LSI_TOPICS:
                print "NON-MATCHING FEATURE LENGTH...LSI"
                import pdb;pdb.set_trace()
            feats.append(cur_feat)
            i+=1

        self.features = np.asarray(feats)
        return

# Register FeatureLSI with FeatureBase (presumably abc.ABC.register — confirm against FeatureBase's declaration).
FeatureBase.register(FeatureLSI)

コード例 #8
0
        return self.features

    def extractFeatures(self, ds, corpus):
        """Extracts features from a DataSet ds"""

        feats = list()
        lsi = corpus.getPOS_LSA()
        tfidf = corpus.getPOS_Tfidf()
        for mm in ds.getGensimPOSCorpus():
            cur_feat = list()

            if len(mm) == 0:
                if params.DEBUG:
                    print "WARNING: No LSI features for current feature. Using (0,1) - which may be a horrible assumption."
                mm = [(0,1)]

            for topic, score in lsi[tfidf[mm]]:
                cur_feat.append(score)

            if len(cur_feat) != params.POS_LSI_TOPICS:
                print "NON-MATCHING FEATURE LENGTH...POS LSI"
                import pdb;pdb.set_trace()

            feats.append(cur_feat)

        self.features = np.asarray(feats)
        return

# Register FeaturePOS_LSI with FeatureBase (presumably abc.ABC.register — confirm against FeatureBase's declaration).
FeatureBase.register(FeaturePOS_LSI)