class FeatureVectorizer(object):
    """
    Builds feature matrices for the stories of the corpus: a text vectorizer
    (raw counts or TF-IDF) followed by a PCA projection.
    """

    def __init__(self, ngram_range=(1, 1), with_tfidf=False, pca_comps=None):
        """
        @param ngram_range - (min_n, max_n) bounds of the n-gram sizes to
            extract; every n with min_n <= n <= max_n is used
        @param with_tfidf - When truthy a TfidfVectorizer is used, otherwise
            a CountVectorizer
        @param pca_comps - Passed as n_components to sklearn.decomposition.PCA
        """
        self.corpus_manager = CorpusManager()

        # Both vectorizers are configured identically; only the class differs.
        vectorizer_cls = TfidfVectorizer if with_tfidf else CountVectorizer
        self.vectorizer = vectorizer_cls(input='filename',
                                         ngram_range=ngram_range)

        self.pca = PCA(n_components=pca_comps)

    def get_fpaths(self, subs):
        """
        Collects the filepaths of the pre-processed story files belonging to
        the requested sub-corpora.

        Sub-corpora are visited in sorted order of their identifier and the
        stories of each sub-corpus in sorted order of story Id. Identifiers
        not known to the corpus manager contribute nothing.

        @param subs - List of sub-corpora by their identifier
        @return List of filepaths to pre-processed story files
        """
        wanted = set(subs)
        ordered = sorted(self.corpus_manager.ids.items(),
                         key=lambda entry: entry[0])

        paths = []
        for sub_name, story_ids in ordered:
            if sub_name not in wanted:
                continue

            paths.extend(
                self.corpus_manager.get_fpath(story_id, tpe='pre-processed')
                for story_id in sorted(story_ids)
            )

        return paths

    def vectorize(self, subs):
        """
        Fits the vectorizer and the PCA on the pre-processed texts of the
        given sub-corpora and returns the resulting feature matrix
        (fit + transform).

        @param subs - List of sub-corpora by identifier
        @return Feature matrix with one row per story, rows following the
            order produced by get_fpaths (sub-corpus identifier, then story
            Id)
        """
        term_matrix = self.vectorizer.fit_transform(self.get_fpaths(subs))

        # The vectorizer output is converted to a dense array before PCA.
        dense = term_matrix.toarray()
        return self.pca.fit_transform(dense)

    def transform(self, subs):
        """
        Projects the stories of the given sub-corpora with the already
        fitted vectorizer and PCA (FeatureVectorizer.vectorize must have
        been called at least once beforehand).

        @param subs - List of sub-corpora by identifier
        @return Feature matrix with one row per story, rows following the
            order produced by get_fpaths (sub-corpus identifier, then story
            Id)
        """
        term_matrix = self.vectorizer.transform(self.get_fpaths(subs))

        # The vectorizer output is converted to a dense array before PCA.
        dense = term_matrix.toarray()
        return self.pca.transform(dense)