Esempio n. 1
0
 def _vectorize(self, X):
     """
     Turn the documents in X into the vectorizer's matrix representation.

     Every document is stemmed with self._stem, the resulting stream is
     passed through add_negation, and the outcome is handed to the
     transform method of self._vectorizer.
     :param X: an iterable of documents
     :return: whatever self._vectorizer.transform returns for the
         processed documents (documented upstream as a sparse matrix)
     """
     # Stemming stays lazy: documents are only processed when consumed.
     stemmed_docs = (self._stem(doc) for doc in X)
     negated_docs = add_negation(stemmed_docs)
     return self._vectorizer.transform(negated_docs)
Esempio n. 2
0
 def _vectorize(self, X):
     """
     Convert an iterable of documents with the already configured vectorizer.

     The pipeline is: self._stem on each document, add_negation over the
     stemmed stream, then self._vectorizer.transform on the result.
     :param X: an iterable of documents
     :return: the result of self._vectorizer.transform (the sparse matrix
         representation of X according to the original documentation)
     """
     prepared = add_negation(self._stem(document) for document in X)
     matrix = self._vectorizer.transform(prepared)
     return matrix
Esempio n. 3
0
 def fit(self, X, y):
     """
     Fit the classifier (and, when needed, the vectorizer) on X and y.

     Each text is stemmed with self._stem and the stream is passed through
     add_negation before vectorization.
     :param X: a list of texts
     :param y: a list of labels
     :return: self
     """
     stemmed_texts = (self._stem(doc) for doc in X)
     negated_texts = add_negation(stemmed_texts)
     # The vectorizer is only fitted when self._vocabulary is falsy;
     # otherwise the texts are merely transformed.
     if not self._vocabulary:
         features = self._vectorizer.fit_transform(negated_texts)
     else:
         features = self._vectorizer.transform(negated_texts)
     self._classifier.fit(features, y)
     return self
Esempio n. 4
0
 def fit(self, X, y):
     """
     Train the model on the given texts and labels.

     The texts are stemmed one by one (self._stem), run through
     add_negation, vectorized and finally given to self._classifier.fit
     together with y.
     :param X: a list of texts
     :param y: a list of labels
     :return: self
     """
     prepared = add_negation(self._stem(document) for document in X)
     # Truthy self._vocabulary -> transform only; otherwise fit_transform.
     vectorize = (self._vectorizer.transform if self._vocabulary
                  else self._vectorizer.fit_transform)
     train_matrix = vectorize(prepared)
     self._classifier.fit(train_matrix, y)
     return self
Esempio n. 5
0
def read_yelp_set(columns=None, data="training", stem=False, preprocess=False):
    """
    Read yelp data set into a Pandas DataFrame

    The CSV file is looked up inside this package as
    ``data/<prefix><data>_review.csv`` and missing values are replaced with
    empty strings.
    :param columns: list columns to return; every column is kept when falsy
    :param data: 'training' or 'test'
    :param stem: if True, read the file whose name starts with 'stemmed_'
    :param preprocess: if True, lower-case the 'text' column and pass it
        through add_negation
    :return: pandas dataframe
    :raises ValueError: if data is neither 'training' nor 'test'
    """
    if data not in ("training", "test"):
        raise ValueError('data must be either "training" or "test"')
    prefix = "stemmed_" if stem else ""
    filename = pkg_resources.resource_filename(
        __name__, "data/%s%s_review.csv" % (prefix, data))
    df = pd.read_csv(filename, encoding="utf8").fillna(u"")
    if columns:
        df = df[columns]
    if preprocess:
        # Call .lower() on each value rather than going through
        # ``unicode.lower``: the ``unicode`` builtin does not exist on
        # Python 3. Building a list keeps what Python 2's map() produced.
        df.text = add_negation([text.lower() for text in df.text])
    return df
Esempio n. 6
0
def read_yelp_set(columns=None, data='training', stem=False, preprocess=False):
    """
    Read yelp data set into a Pandas DataFrame

    The file ``data/<prefix><data>_review.csv`` shipped with this package is
    loaded as UTF-8 and its missing values are filled with empty strings.
    :param columns: list columns to return; every column is kept when falsy
    :param data: 'training' or 'test'
    :param stem: if True, load the file prefixed with 'stemmed_'
    :param preprocess: if True, the 'text' column is lower-cased and then
        replaced by the output of add_negation
    :return: pandas dataframe
    :raises ValueError: if data is neither 'training' nor 'test'
    """
    if data not in ('training', 'test'):
        raise ValueError('data must be either "training" or "test"')
    prefix = "stemmed_" if stem else ""
    filename = pkg_resources.resource_filename(
        __name__, "data/%s%s_review.csv" % (prefix, data))
    df = pd.read_csv(filename, encoding='utf8').fillna(u'')
    if columns:
        df = df[columns]
    if preprocess:
        # ``unicode.lower`` only exists on Python 2; calling the method on
        # each value works on both major versions. A list (not a lazy map
        # object) is passed so add_negation sees what it saw on Python 2.
        lowered = [text.lower() for text in df.text]
        df.text = add_negation(lowered)
    return df