def _vectorize(self, X):
    """
    Convert documents into the vectorizer's matrix representation.

    Each document is stemmed with ``self._stem``, the stemmed stream is
    passed through ``add_negation``, and the result is handed to
    ``self._vectorizer.transform``.

    :param X: an iterable of documents
    :return: the sparse matrix representation of X
    """
    stemmed_docs = (self._stem(doc) for doc in X)
    negated_docs = add_negation(stemmed_docs)
    return self._vectorizer.transform(negated_docs)
def fit(self, X, y):
    """
    Fit the model on labelled texts.

    The texts are stemmed and passed through ``add_negation`` before being
    vectorized; the resulting matrix is used to fit ``self._classifier``.

    :param X: a list of texts
    :param y: a list of labels
    :return: self
    """
    stemmed_docs = (self._stem(doc) for doc in X)
    docs = add_negation(stemmed_docs)

    # With a truthy self._vocabulary the vectorizer is only applied
    # (transform); otherwise fit_transform is called on these texts.
    if not self._vocabulary:
        features = self._vectorizer.fit_transform(docs)
    else:
        features = self._vectorizer.transform(docs)

    self._classifier.fit(features, y)
    return self
def read_yelp_set(columns=None, data="training", stem=False, preprocess=False):
    """
    Read yelp data set into a Pandas DataFrame

    :param columns: list of columns to return; every column is kept when
        this is None or empty
    :param data: 'training' or 'test'
    :param stem: if True, read the "stemmed_"-prefixed CSV instead of the
        plain one
    :param preprocess: if True, lowercase the 'text' column and pass it
        through add_negation. This happens after column selection, so
        'text' must be among the selected columns.
    :return: pandas dataframe
    :raises ValueError: if data is neither 'training' nor 'test'
    """
    if data not in ("training", "test"):
        raise ValueError('data must be either "training" or "test"')

    prefix = "stemmed_" if stem else ""
    filename = pkg_resources.resource_filename(
        __name__, "data/%s%s_review.csv" % (prefix, data)
    )
    # NaN cells are replaced with empty strings so every cell can be
    # treated as text below.
    df = pd.read_csv(filename, encoding="utf8").fillna(u"")

    if columns:
        df = df[columns]
    if preprocess:
        # Calling .lower() on each value yields the same list that
        # map(unicode.lower, ...) produced on Python 2, and keeps working on
        # Python 3 where `unicode` no longer exists and map() is lazy.
        df.text = add_negation([text.lower() for text in df.text])
    return df
def read_yelp_set(columns=None, data='training', stem=False, preprocess=False):
    """
    Read yelp data set into a Pandas DataFrame

    :param columns: list of columns to return; every column is kept when
        this is None or empty
    :param data: 'training' or 'test'
    :param stem: if True, read the "stemmed_"-prefixed CSV instead of the
        plain one
    :param preprocess: if True, lowercase the 'text' column and pass it
        through add_negation. This happens after column selection, so
        'text' must be among the selected columns.
    :return: pandas dataframe
    :raises ValueError: if data is neither 'training' nor 'test'
    """
    if data not in ('training', 'test'):
        raise ValueError('data must be either "training" or "test"')

    prefix = "stemmed_" if stem else ""
    filename = pkg_resources.resource_filename(
        __name__, "data/%s%s_review.csv" % (prefix, data))
    # NaN cells are replaced with empty strings so every cell can be
    # treated as text below.
    df = pd.read_csv(filename, encoding='utf8').fillna(u'')

    if columns:
        df = df[columns]
    if preprocess:
        # Calling .lower() on each value yields the same list that
        # map(unicode.lower, ...) produced on Python 2, and keeps working on
        # Python 3 where `unicode` no longer exists and map() is lazy.
        df.text = add_negation([text.lower() for text in df.text])
    return df