コード例 #1
0
class FeatureVectorizer(object):
	"""
	Builds feature matrices for stories in the corpus.

	Pre-processed story files are turned into n-gram count vectors (optionally
	TF-IDF weighted) and the resulting matrix is then passed through PCA.
	File locations and story Ids are obtained from a CorpusManager.
	"""

	def __init__(self, ngram_range=(1, 1), with_tfidf=False, pca_comps=None):
		"""
		@param ngram_range - The lower and upper boundary of the range of
			n-values for different n-grams to be extracted (All values of n such
			that min_n <= n <= max_n will be used)
		@param with_tfidf - Whether to apply TF-IDF weightings to vector values
			(selects TfidfVectorizer instead of CountVectorizer)
		@param pca_comps - # of PCA components (as defined in
			sklearn.decomposition.PCA); passed through unchanged as
			n_components, including the default None
		"""

		self.corpus_manager = CorpusManager()

		# input='filename': the vectorizer is handed file paths (see
		# get_fpaths), not raw text.
		vectorizer_cls = TfidfVectorizer if with_tfidf else CountVectorizer
		self.vectorizer = vectorizer_cls(input='filename',
			ngram_range=ngram_range)

		# NOTE: PCA is always constructed and applied by vectorize/transform,
		# even when pca_comps is None.
		self.pca = PCA(n_components=pca_comps)

	def get_fpaths(self, subs):
		"""
		Returns the filepaths to the pre-processed versions of the stories in
		the given list of sub-corpora.

		Paths are ordered by sub-corpus identifier and then by story Id.
		Identifiers not present in the corpus manager are ignored, and
		duplicate identifiers are only counted once.

		@param subs - List of sub-corpora by their identifier
		@return List of filepaths to pre-processed story files
		"""

		wanted = set(subs)
		manager = self.corpus_manager

		fpaths = []
		# Walk the sub-corpora in sorted identifier order so that the output
		# order does not depend on the order of `subs` or of the ids mapping.
		for sub, sids in sorted(manager.ids.items(),
			key=lambda item: item[0]):
			if sub not in wanted:
				continue

			fpaths.extend(manager.get_fpath(sid, tpe='pre-processed')
				for sid in sorted(sids))

		return fpaths

	def vectorize(self, subs):
		"""
		Vectorizes (fit + transform) the pre-processed texts for the stories in
		the specified sub-corpora, outputting the feature matrix.

		Both the vectorizer and the PCA are (re)fitted on every call, so any
		previous fit is replaced.

		The rows of the feature matrix correspond to individual stories, ordered
		in alphabetical order of sub-corpus name and then story Id.

		@param subs - List of sub-corpora by identifier
		@return Feature matrix with rows corresponding to individual stories,
			ordered in alphabetical order of sub-corpus name and then story Id
		"""

		counts = self.vectorizer.fit_transform(self.get_fpaths(subs))

		# The vectorizer output is converted with toarray() before being
		# handed to PCA.
		return self.pca.fit_transform(counts.toarray())

	def transform(self, subs):
		"""
		Transforms the stories in the given sub-corpora according to the fitted
		vectorizer and PCA (Must have called FeatureVectorizer.vectorize at
		least once). Nothing is refitted here.

		@param subs - List of sub-corpora by identifier
		@return Feature matrix with rows corresponding to individual stories,
			ordered in alphabetical order of sub-corpus name and then story Id
		"""

		counts = self.vectorizer.transform(self.get_fpaths(subs))
		return self.pca.transform(counts.toarray())