def extractFeatures(self, ds, corpus):
    """Extracts features from a DataSet ds"""
    lenfeats = list()
    for line in ds.getRawText():
        curfeat = list()
        # Tokenize, lowercase, and keep alphabetic words only.
        words = FeatureTransitions.word_splitter.tokenize(line)
        words = [word.lower() for word in words]
        words = [word for word in words if word.isalpha()]
        word_set = set(words)
        word_set.discard('')
        # Count transition words, both total and unique.
        unique_transitions_count = 0
        transitions_count = 0
        for word in words:
            if word in FeatureTransitions.transitions:
                transitions_count += 1
        for word in word_set:
            if word in FeatureTransitions.transitions:
                unique_transitions_count += 1
        #curfeat.append(transitions_count)
        curfeat.append(unique_transitions_count)
        #curfeat.append(unique_transitions_count/len(words))
        lenfeats.append(curfeat)
    self.features = np.asarray(lenfeats)

FeatureBase.register(FeatureTransitions)
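# For reference, a minimal sketch of the class-level attributes the method
# above relies on. These values are hypothetical -- the real tokenizer and
# transition-word list are defined elsewhere in the class:
#
#     import nltk
#
#     class FeatureTransitions(FeatureBase):
#         word_splitter = nltk.tokenize.RegexpTokenizer(r'\w+')
#         transitions = set(['however', 'therefore', 'moreover',
#                            'furthermore', 'consequently', 'nevertheless'])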
def extractFeatures(self, ds, corpus):
    """Extracts features from a DataSet ds"""
    lenfeats = list()
    for line in ds.getRawText():
        # curfeat, words, and word_set are built here per essay as in
        # FeatureTransitions (tokenization elided from this excerpt).
        misspelled_word_count = 0
        misspelled_char_count = 0
        misspelled_words = set()
        misspelled_char_count_in_set = 0
        misspelled_short_word_count = 0
        for word in words:
            if word not in FeatureSpelling.dictionary:
                misspelled_word_count += 1
                if len(word) < 6:
                    misspelled_short_word_count += 1
                misspelled_char_count += len(word)
        for word in word_set:
            if word not in FeatureSpelling.dictionary:
                misspelled_words.add(word)
                misspelled_char_count_in_set += len(word)
        curfeat.append(len(misspelled_words))
        curfeat.append(misspelled_short_word_count)
        #curfeat.append(len(misspelled_words)/len(word_set))
        #curfeat.append(misspelled_word_count)
        # +1 guards against division by zero when nothing is misspelled.
        curfeat.append(misspelled_char_count / (misspelled_word_count + 1))
        #curfeat.append(misspelled_word_count / len(words))
        #curfeat.append(misspelled_char_count_in_set / (len(word_set) + 1))
        lenfeats.append(curfeat)
    self.features = np.asarray(lenfeats)

FeatureBase.register(FeatureSpelling)
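# A plausible way to build FeatureSpelling.dictionary, assuming a plain word
# list like NLTK's (hypothetical -- the real dictionary source is not shown
# in this excerpt):
def _build_spelling_dictionary():
    import nltk
    return set(w.lower() for w in nltk.corpus.words.words())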
def extractFeatures(self, ds, corpus):
    """Extracts features from a DataSet ds"""
    feats = list()
    # Setup elided from this excerpt: tgrades holds the training essays'
    # grades, and inside the per-essay loop sims holds the current essay's
    # similarity to each training essay (computed as in FeatureSim below).
    for mm in ds.getGensimCorpus():  # per-essay loop, reconstructed from context
        cur_feat = list()
        # Keep a counter of how many neighbours were added per grade.
        # Only add if index < length(ds_train)
        sorted_inds = [i[0] for i in sorted(enumerate(sims),
                                            key=lambda x: x[1], reverse=True)]
        counts = dict()
        for grade in range(min(tgrades), max(tgrades) + 1):
            counts[grade] = 0
        for i in range(params.NUM_NN):
            # On the training set the nearest neighbour is the essay itself,
            # so skip it here and take one extra neighbour below instead.
            if i == 0 and ds.isTrainSet():
                continue
            ind = sorted_inds[i]
            grade = tgrades[ind]
            counts[grade] += 1
        if ds.isTrainSet():
            ind = sorted_inds[params.NUM_NN]
            grade = tgrades[ind]
            counts[grade] += 1
        for key in sorted(counts):  # fixed order keeps feature columns aligned
            cur_feat.append(counts[key])
        feats.append(cur_feat)
    self.features = np.asarray(feats)

FeatureBase.register(FeatureNN)
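# Sketch of the elided similarity setup assumed above (hypothetical names):
# a gensim index over the *training* essays gives, for each essay, one score
# per training essay, and tgrades is row-aligned with that index.
def _nn_setup_sketch(corpus, train_mm_corpus, essay_mm, train_grades):
    lsi = corpus.getLSA()
    tfidf = corpus.getTfidf()
    index = gensim.similarities.MatrixSimilarity(lsi[tfidf[train_mm_corpus]])
    sims = index[lsi[tfidf[essay_mm]]]  # one score per training essay
    tgrades = train_grades              # grades aligned with index rows
    return sims, tgrades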
def extractFeatures(self, ds, corpus):
    """Extracts features from a DataSet ds"""
    unigram_scored_fname = 'cache/pos_unigram_corpus_scored.%s.set%d.pickle' \
        % (ds.getID(), ds.getEssaySet())
    try:
        with open(unigram_scored_fname, 'rb') as f:
            print "Found Pickled <POS Unigram Corpus Scores>. Loading..."
            scored = pickle.load(f)
    except Exception:
        # Cache miss: score POS unigrams by corpus frequency and cache them.
        freqs = nltk.FreqDist(x for x in corpus.getPOS())  # should we preserve case?
        scored = freqs.items()  # frequency-sorted in the NLTK 2.x API
        scored = [(word, score) for (word, score) in scored
                  if word not in nltk.corpus.stopwords.words('english')]
        with open(unigram_scored_fname, 'wb') as f:
            pickle.dump(scored, f)
    # Keep only the most frequent tags as features.
    scored = scored[0:params.TOTAL_POS_UNIGRAMS]
    #print "Unigram features: "
    #print scored
    feats = list()
    for line in ds.getPOS():
        cur_feats = list()
        for pos, score in scored:
            cur_feats.append(line.count(pos))
        feats.append(cur_feats)
    #print feats
    self.features = np.asarray(feats)

FeatureBase.register(FeaturePOSUnigram)
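# Toy check of the counting logic above, assuming ds.getPOS() yields tag
# sequences (hypothetical data):
def _pos_unigram_demo():
    scored = [('NN', 1200), ('IN', 900)]          # two top-scoring tags
    line = ['DT', 'NN', 'VBZ', 'IN', 'DT', 'NN']  # one essay's tag sequence
    return [line.count(pos) for pos, score in scored]  # -> [2, 1]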
def extractFeatures(self, ds, corpus):
    """Extracts features from a DataSet ds"""
    # tokens collects the prompt's vocabulary; scored (scored prompt bigrams)
    # is built earlier in the method (elided from this excerpt).
    for bigram, score in scored:
        tokens.append(bigram)
    # Get feature bows for projection into LSI
    dictionary = corpus.getWordDictionary()
    # LSI and tf-idf models trained on the corpus
    lsi = corpus.getLSA()
    tfidf = corpus.getTfidf()
    mm_corpus = ds.getGensimCorpus()
    # Project the prompt into LSI space and rank every essay against it.
    vec_bow = dictionary.doc2bow(tokens)
    vec_lsi = lsi[tfidf[vec_bow]]
    index = gensim.similarities.MatrixSimilarity(lsi[tfidf[mm_corpus]])
    sims = index[vec_lsi]
    # One feature per essay: its similarity to the prompt.
    feats = list()
    for sim in sims:
        cur_feat = list()
        cur_feat.append(sim)
        feats.append(cur_feat)
    self.features = np.asarray(feats)

FeatureBase.register(FeaturePrompt)
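# doc2bow maps a token list to sparse (token_id, count) pairs; a throwaway
# dictionary illustrates the projection input built above (hypothetical data):
def _doc2bow_demo():
    from gensim import corpora
    d = corpora.Dictionary([['cat', 'sat', 'mat'], ['cat', 'hat']])
    # Two 'cat's and one 'mat', returned as (token_id, count) pairs.
    return d.doc2bow(['cat', 'cat', 'mat'])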
def extractFeatures(self, ds, corpus):
    """Extracts features from a DataSet ds"""
    feats = list()
    lsi = corpus.getLSA()
    tfidf = corpus.getTfidf()
    index = gensim.similarities.MatrixSimilarity(lsi[tfidf[ds.getGensimCorpus()]])
    for mm in ds.getGensimCorpus():
        cur_feat = list()
        vec_lsi = lsi[tfidf[mm]]
        sims = index[vec_lsi]
        # AGGREGATE SIMILARITY
        sims = np.asarray(sims)
        #cur_feat.append(np.var(sims))
        cur_feat.append(np.mean(sims))
        #cur_feat.append(np.median(sims))
        feats.append(cur_feat)
    self.features = np.asarray(feats)

FeatureBase.register(FeatureSim)
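# MatrixSimilarity queries return cosine similarity against every indexed
# document, so np.mean(sims) above is the essay's mean similarity to the
# whole essay set, itself included. Tiny check (hypothetical data):
def _matrix_similarity_demo():
    idx = gensim.similarities.MatrixSimilarity(
        [[(0, 1.0)], [(1, 1.0)]], num_features=2)
    return idx[[(0, 1.0)]]  # -> [1.0, 0.0]: self-similarity, orthogonal doc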
def extractFeatures(self, ds, corpus):
    """Extracts features from a DataSet ds"""
    feats = list()
    lsi = corpus.getLSA()
    tfidf = corpus.getTfidf()
    for mm in ds.getGensimCorpus():
        cur_feat = list()
        if len(mm) == 0:
            if params.DEBUG:
                print "WARNING: No LSI features for current essay. Using (0,1) - which may be a horrible assumption."
            mm = [(0, 1)]
        for topic, score in lsi[tfidf[mm]]:
            cur_feat.append(score)
        # gensim drops near-zero topic weights, so the projection can come
        # back shorter than the configured topic count.
        if len(cur_feat) != params.LSI_TOPICS:
            print "NON-MATCHING FEATURE LENGTH...LSI"
            import pdb; pdb.set_trace()
        feats.append(cur_feat)
    self.features = np.asarray(feats)

FeatureBase.register(FeatureLSI)
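# One way to avoid the length check tripping: densify the LSI projection
# explicitly, padding the dropped near-zero topics back in (a sketch, using
# the same variable names as the method above):
def _dense_lsi_vector(lsi, tfidf, mm, num_topics):
    from gensim import matutils
    return matutils.sparse2full(lsi[tfidf[mm]], num_topics)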
def extractFeatures(self, ds, corpus):
    """Extracts features from a DataSet ds"""
    feats = list()
    lsi = corpus.getPOS_LSA()
    tfidf = corpus.getPOS_Tfidf()
    for mm in ds.getGensimPOSCorpus():
        cur_feat = list()
        if len(mm) == 0:
            if params.DEBUG:
                print "WARNING: No POS LSI features for current essay. Using (0,1) - which may be a horrible assumption."
            mm = [(0, 1)]
        for topic, score in lsi[tfidf[mm]]:
            cur_feat.append(score)
        if len(cur_feat) != params.POS_LSI_TOPICS:
            print "NON-MATCHING FEATURE LENGTH...POS LSI"
            import pdb; pdb.set_trace()
        feats.append(cur_feat)
    self.features = np.asarray(feats)

FeatureBase.register(FeaturePOS_LSI)
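# FeatureLSI and FeaturePOS_LSI differ only in their corpus/model accessors;
# a shared helper could remove the duplication (a sketch with a hypothetical
# name, using the densifying trick noted after FeatureLSI):
def _lsi_topic_feats(mm_corpus, lsi, tfidf, num_topics):
    from gensim import matutils
    feats = list()
    for mm in mm_corpus:
        if len(mm) == 0:
            mm = [(0, 1)]  # same empty-document fallback as above
        feats.append(matutils.sparse2full(lsi[tfidf[mm]], num_topics))
    return np.asarray(feats)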