def getAllBoW(self):
    """Return unigram and bigram tokens for every raw-text line.

    The result is a list with one entry per line yielded by
    ``self.getRawText()``. Each entry is the token list produced by
    ``LanguageUtils.tokenize`` with the line's bigram tuples (as returned
    by ``score_ngrams`` using the PMI measure) appended after the unigrams.

    The result is cached as a pickle under ``cache/``, keyed by filename,
    essay set and domain. A missing or unreadable cache file is treated
    as a cache miss and the bag of words is rebuilt and written back.
    """
    fname = 'cache/word_ngrams.%s.set%d.dom%d.pickle' % (
        self.getFilename(), self.getEssaySet(), self.getDomain())

    # NOTE: pickle is only safe because this file is a locally generated
    # cache; never point this at data from an untrusted source.
    try:
        with open(fname, 'rb') as cache_file:
            return pickle.load(cache_file)
    except (IOError, OSError, EOFError, pickle.UnpicklingError,
            AttributeError, ImportError, IndexError):
        # Cache miss (no file) or a truncated/corrupt cache: rebuild below.
        pass

    bigram_measures = nltk.collocations.BigramAssocMeasures()
    bow = []
    for line in self.getRawText():
        tokens = LanguageUtils.tokenize(line)
        finder = BigramCollocationFinder.from_words(tokens)
        # Only the bigram tuples are kept; the PMI scores are discarded.
        scored = finder.score_ngrams(bigram_measures.pmi)
        tokens.extend(bigram for bigram, _score in scored)
        bow.append(tokens)

    # Pickle data is binary: write with 'wb' to match the 'rb' read above.
    with open(fname, 'wb') as cache_file:
        pickle.dump(bow, cache_file)
    return bow
def extractFeatures(self, ds, corpus):
    """Extract one similarity feature per entry of ``ds``'s gensim corpus.

    The essay-set prompt is read from ``data/essay_set_desc_<set>.txt``,
    tokenized into unigrams plus bigram tuples, and projected through the
    corpus' TF-IDF and LSI models. The similarity between that prompt
    vector and every LSI-projected entry of ``ds.getGensimCorpus()`` is
    stored in ``self.features`` as an array with one single-element row
    per similarity value.

    Args:
        ds: data set providing ``getEssaySet()`` and ``getGensimCorpus()``.
        corpus: object providing ``getWordDictionary()``, ``getLSA()`` and
            ``getTfidf()``.

    Returns:
        None; the result is stored on ``self.features``.
    """
    # Load the prompt text; the context manager closes the file handle.
    with open('data/essay_set_desc_%d.txt' % ds.getEssaySet(), 'r') as f:
        prompt = f.read()

    # Tokenize into unigrams, then append the bigram tuples.
    tokens = LanguageUtils.tokenize(prompt)
    bigram_measures = nltk.collocations.BigramAssocMeasures()
    finder = BigramCollocationFinder.from_words(tokens)
    scored = finder.score_ngrams(bigram_measures.raw_freq)
    tokens.extend(bigram for bigram, _score in scored)

    # Models used to project bag-of-words vectors into LSI space.
    dictionary = corpus.getWordDictionary()
    lsi = corpus.getLSA()
    tfidf = corpus.getTfidf()
    # Fetched once and reused (assumes getGensimCorpus() is a plain getter).
    mm_corpus = ds.getGensimCorpus()

    # Project the prompt into LSI space.
    vec_bow = dictionary.doc2bow(tokens)
    vec_lsi = lsi[tfidf[vec_bow]]

    # Similarity of the prompt against each LSI-projected corpus entry.
    index = gensim.similarities.MatrixSimilarity(lsi[tfidf[mm_corpus]])
    sims = index[vec_lsi]

    # One single-element feature row per similarity value.
    self.features = np.asarray([[sim] for sim in sims])
    return