def getAllBoW(self):
    """Return unigram, bigram words as bag of words.

    The result is a list with one entry per line yielded by
    ``self.getRawText()``.  Each entry is the token list produced by
    ``LanguageUtils.tokenize`` for that line, followed by every bigram
    that ``BigramCollocationFinder.score_ngrams`` reports for it (scored
    with PMI; the scores themselves are discarded).

    The list is cached as a pickle under ``cache/``, keyed by file name,
    essay set and domain.  A readable cache file is returned as-is;
    otherwise the bag of words is rebuilt and the cache is rewritten.

    Raises:
        IOError/OSError: if the rebuilt cache cannot be written (for
            example when the ``cache/`` directory does not exist).
    """
    fname = 'cache/word_ngrams.%s.set%d.dom%d.pickle' % (
        self.getFilename(), self.getEssaySet(), self.getDomain())

    # NOTE: pickle.load can execute arbitrary code; cache/ must only hold
    # files written by this program.
    try:
        with open(fname, 'rb') as f:
            return pickle.load(f)
    except Exception:
        # Missing, truncated or otherwise unreadable cache: fall through
        # and rebuild.  Unpickling can fail with many exception types, so
        # the catch is broad, but unlike a bare ``except`` it lets
        # KeyboardInterrupt / SystemExit propagate.
        pass

    bigram_measures = nltk.collocations.BigramAssocMeasures()
    bow = []
    for line in self.getRawText():
        cur = LanguageUtils.tokenize(line)
        finder = BigramCollocationFinder.from_words(cur)
        # Bigrams are appended to the same list that holds the unigrams.
        cur.extend(
            bigram
            for bigram, _score in finder.score_ngrams(bigram_measures.pmi))
        bow.append(cur)

    # Pickles are binary data: write in 'wb' to match the 'rb' read above,
    # and close the handle deterministically so the cache is flushed.
    with open(fname, 'wb') as f:
        pickle.dump(bow, f)
    return bow
def getPOS(self):
    """Return the part-of-speech tags for every line of raw text.

    The result is a list with one entry per line yielded by
    ``self.getRawText()``; each entry is the list of tags (words dropped)
    that the Hunpos tagger assigned to the line's tokens.

    Lookup order:
      1. ``self.pos_tags`` if it is already populated;
      2. the pickle cache under ``cache/`` keyed by file name and essay set;
      3. a fresh tagging run, whose result is stored on ``self.pos_tags``
         and written back to the cache.

    Raises:
        IOError/OSError: if the freshly computed tags cannot be written
            to the cache (for example when ``cache/`` does not exist).
    """
    if self.pos_tags:
        return self.pos_tags

    fname = 'cache/pos.%s.set%d.pickle' % (self.file_name, self.essay_set)

    # NOTE: pickle.load can execute arbitrary code; cache/ must only hold
    # files written by this program.
    try:
        with open(fname, 'rb') as f:
            self.pos_tags = pickle.load(f)
        return self.pos_tags
    except Exception:
        # Missing, truncated or otherwise unreadable cache: fall through
        # and re-tag.  Broad on purpose (unpickling fails in many ways),
        # but KeyboardInterrupt / SystemExit still propagate.
        pass

    pos_lines = []
    tot_ln = self.size()
    hunpos = nltk.tag.HunposTagger("en_wsj.model")
    for prog, line in enumerate(self.getRawText(), 1):
        tokens = LanguageUtils.punkt_tokenize(line)
        pos_lines.append([tag for _word, tag in hunpos.tag(tokens)])
        if prog % 100 == 0:
            # Single-argument call form prints identically on Python 2 and 3.
            print("POS Tagging %d of %d" % (prog, tot_ln))
    self.pos_tags = pos_lines

    # Pickles are binary data: write in 'wb' to match the 'rb' read above,
    # and close the handle deterministically so the cache is flushed.
    with open(fname, 'wb') as f:
        pickle.dump(self.pos_tags, f)
    return self.pos_tags
def extractFeatures(self, ds, corpus):
    """Extracts features from a DataSet ds.

    Reads the prompt text for ``ds``'s essay set from
    ``data/essay_set_desc_<set>.txt``, turns it into unigram + bigram
    tokens, projects it through ``corpus``'s TF-IDF and LSI models, and
    scores it against every document of ``ds.getGensimCorpus()`` with
    gensim's ``MatrixSimilarity``.

    Args:
        ds: data set providing ``getEssaySet()`` and ``getGensimCorpus()``.
        corpus: object providing ``getWordDictionary()``, ``getLSA()``
            and ``getTfidf()``.

    Returns:
        None.  The result is stored on ``self.features`` as an array
        holding one single-element row (the similarity) per entry of the
        similarity vector.

    Raises:
        IOError/OSError: if the prompt description file cannot be read.
    """
    # Load the prompt text; the context manager closes the file promptly.
    with open('data/essay_set_desc_%d.txt' % ds.getEssaySet(), 'r') as f:
        prompt = f.read()

    # Tokenize into unigrams, then append the bigrams to the same list.
    tokens = LanguageUtils.tokenize(prompt)
    bigram_measures = nltk.collocations.BigramAssocMeasures()
    finder = BigramCollocationFinder.from_words(tokens)
    tokens.extend(
        bigram
        for bigram, _score in finder.score_ngrams(bigram_measures.raw_freq))

    # Models needed to project bags of words into LSI space.
    dictionary = corpus.getWordDictionary()
    lsi = corpus.getLSA()
    tfidf = corpus.getTfidf()
    # Fetch the essay corpus once and reuse it for the similarity index.
    mm_corpus = ds.getGensimCorpus()

    # Project the prompt into LSI space and compare it to every essay.
    vec_bow = dictionary.doc2bow(tokens)
    vec_lsi = lsi[tfidf[vec_bow]]
    index = gensim.similarities.MatrixSimilarity(lsi[tfidf[mm_corpus]])
    sims = index[vec_lsi]

    # One single-element feature row per similarity score.
    self.features = np.asarray([[sim] for sim in sims])