Example #1
0
 def set_params(self, **kwargs):
     """Update estimator parameters and rebuild the vectorizer state.

     Delegates to ``CountVectorizer.set_params`` first, then re-runs
     ``CountVectorizer.__init__`` so the preprocessor and n-gram range
     reflect the (possibly changed) ``column``/``size``/``terminator``
     attributes.
     """
     CountVectorizer.set_params(self, **kwargs)
     prep = get_preprocessor(self.column, self.size, self.terminator)
     CountVectorizer.__init__(
         self,
         preprocessor=prep,
         ngram_range=(1, self.size),
         analyzer='char',
         binary=self.binary,
     )
Example #2
0
 def set_params(self, **kwargs):
     """Set parameters, then re-initialise the base CountVectorizer.

     Re-initialisation keeps the character n-gram configuration in sync
     with the updated instance attributes.
     """
     CountVectorizer.set_params(self, **kwargs)
     init_kwargs = {
         'preprocessor': get_preprocessor(self.column, self.size,
                                          self.terminator),
         'ngram_range': (1, self.size),
         'analyzer': 'char',
         'binary': self.binary,
     }
     CountVectorizer.__init__(self, **init_kwargs)
Example #3
0
    def __init__(self, lang, **kwargs):
        """Build a stemming vectorizer for *lang*.

        Parameters
        ----------
        lang : str
            Language name understood by ``SnowballStemmer``
            (matched case-insensitively via ``lang.lower()``).
        **kwargs
            Forwarded as keyword arguments to ``CountVectorizer.__init__``.
        """
        # BUG FIX: the original called ``CountVectorizer.__init__(self, kwargs)``,
        # passing the kwargs dict as the FIRST POSITIONAL argument instead of
        # expanding it as keyword arguments.
        CountVectorizer.__init__(self, **kwargs)

        try:
            self.stemmer = SnowballStemmer(lang.lower()).stem
            self.vect = CountVectorizer()
            self.analyzer = self.analyzer_nltk
        except ValueError:
            # Unsupported language: deliberately fall back to the plain
            # CountVectorizer analyzer rather than failing construction.
            pass
 def __init__(self, large_file=False):
     """Select a hashing or counting backend at construction time.

     For large inputs a ``HashingVectorizer`` is initialised; otherwise a
     ``CountVectorizer`` is configured to leave pre-tokenised input alone.
     """
     if not large_file:
         # Input is already tokenised: override the built-in string
         # processing by disabling the preprocessor and lowercasing and
         # supplying the identity function as the tokenizer.
         CountVectorizer.__init__(
             self, tokenizer=identity, preprocessor=None, lowercase=False)
     else:
         HashingVectorizer.__init__(self)
Example #5
0
 def __init__(self, column, binary=False, size=3, terminator='$'):
     """Character n-gram vectorizer over a single column.

     Parameters
     ----------
     column : column identifier handed to ``get_preprocessor``.
     binary : bool, forwarded to ``CountVectorizer``.
     size : int, maximum n-gram length (range is ``(1, size)``).
     terminator : str, terminator symbol handed to ``get_preprocessor``.
     """
     self.column = column
     self.size = size
     self.terminator = terminator
     prep = get_preprocessor(self.column, self.size, self.terminator)
     CountVectorizer.__init__(
         self,
         preprocessor=prep,
         ngram_range=(1, size),
         analyzer='char',
         binary=binary,
     )
Example #6
0
 def __init__(self, column, binary=False, size=3, terminator='$'):
     """Initialise a char n-gram CountVectorizer for one column.

     Stores ``column``/``size``/``terminator`` on the instance and
     configures the base class with a character analyzer.
     """
     self.column = column
     self.size = size
     self.terminator = terminator
     base_kwargs = dict(
         preprocessor=get_preprocessor(self.column, self.size,
                                       self.terminator),
         ngram_range=(1, size),
         analyzer='char',
         binary=binary,
     )
     CountVectorizer.__init__(self, **base_kwargs)
Example #7
0
    def __init__(
            self,
            lowercase: Boolean(),
            stopwords_remove: Boolean(),
            binary: Boolean(),
            inner_tokenizer: algorithm(Sentence(), List(Word())),
            inner_stemmer: algorithm(Word(), Stem()),
            inner_stopwords: algorithm(List(Word()), List(Word())),
    ):
        """Compose a pluggable tokenizer/stemmer/stopword pipeline.

        ``lowercase`` and ``binary`` are not stored here; they are passed
        straight through to ``_CountVectorizer`` below.
        """
        # Keep the pluggable components on the instance.
        self.inner_stopwords = inner_stopwords
        self.inner_stemmer = inner_stemmer
        self.inner_tokenizer = inner_tokenizer
        self.stopwords_remove = stopwords_remove

        SklearnTransformer.__init__(self)
        _CountVectorizer.__init__(self, lowercase=lowercase, binary=binary)
Example #8
0
    def __init__(self,
                 input="content",
                 encoding="utf-8",
                 decode_error="strict",
                 strip_accents=None,
                 lowercase=True,
                 preprocessor=None,
                 tokenizer=None,
                 stop_words=None,
                 # BUG FIX: must be a raw string. In the original non-raw
                 # literal, "\b" was a backspace character (\x08), so the
                 # default token pattern could never match word boundaries.
                 # r"(?u)\b\w\w+\b" matches sklearn's documented default.
                 token_pattern=r"(?u)\b\w\w+\b",
                 ngram_range=(1, 1),
                 analyzer="word",
                 max_df=1.0,
                 min_df=1,
                 max_features=None,
                 vocabulary=None,
                 binary=False,
                 dtype=numpy.int64,
                 progress_bar_resolution_seconds=.333,
                 progress_bar_clear_when_done=False):
        """Progress-bar-aware CountVectorizer.

        All vectorizer parameters mirror ``CountVectorizer`` and are
        forwarded to it unchanged; the two ``progress_bar_*`` parameters
        configure the ``ProgressBarVectorizer`` mixin.
        """
        CountVectorizer.__init__(self,
                                 input=input,
                                 encoding=encoding,
                                 decode_error=decode_error,
                                 strip_accents=strip_accents,
                                 lowercase=lowercase,
                                 preprocessor=preprocessor,
                                 tokenizer=tokenizer,
                                 stop_words=stop_words,
                                 token_pattern=token_pattern,
                                 ngram_range=ngram_range,
                                 analyzer=analyzer,
                                 max_df=max_df,
                                 min_df=min_df,
                                 max_features=max_features,
                                 vocabulary=vocabulary,
                                 binary=binary,
                                 dtype=dtype)

        ProgressBarVectorizer.__init__(self, progress_bar_resolution_seconds,
                                       progress_bar_clear_when_done)
Example #9
0
 def __init__(self, stopwords_list=None, max_features=None):
     """Word vectorizer with unicode accent stripping plus a WordNet lemmatizer.

     Parameters
     ----------
     stopwords_list : list of str, optional
         Stop words forwarded to ``CountVectorizer``.
     max_features : int, optional
         Vocabulary-size cap forwarded to ``CountVectorizer``.
     """
     # Dropped the original backslash line continuations: they are redundant
     # inside parentheses and fragile (trailing whitespace after a backslash
     # is a SyntaxError).
     CountVectorizer.__init__(self,
                              analyzer="word",
                              strip_accents="unicode",
                              stop_words=stopwords_list,
                              max_features=max_features)
     self.en_lemmatizer = nltk.stem.WordNetLemmatizer()
Example #10
0
	def __init__(self, analyzer=BOWAnalyzer, max_df=None):
		"""Delegate construction to CountVectorizer with a custom analyzer.

		NOTE(review): ``max_df=None`` differs from sklearn's documented
		default of ``1.0`` and may be rejected by its validation — confirm
		this default is intentional for the CountVectorizer in use here.
		"""
		CountVectorizer.__init__(self, analyzer=analyzer, max_df=max_df)
Example #11
0
 def __init__(self, n_grams=1, first_last_sentence_only=False):
     """Fixed-size n-gram counter.

     Parameters
     ----------
     n_grams : int
         Exact n-gram size; the base range is ``(n_grams, n_grams)``.
     first_last_sentence_only : bool
         Flag stored for later use; not consumed here.
     """
     CountVectorizer.__init__(self, ngram_range=(n_grams, n_grams))
     self.first_last_sentence_only = first_last_sentence_only
     # Mapping populated elsewhere; starts empty.
     self.term_dict = {}
Example #12
0
 def __init__(self, **kwargs):
     """Thin wrapper: forward all keyword arguments to CountVectorizer."""
     CountVectorizer.__init__(self, **kwargs)
 def __init__(self):
     """Binary presence/absence vectorizer over pre-tokenised input."""
     # Input is already tokenised, so string preprocessing and lowercasing
     # are disabled and the identity function serves as the tokenizer.
     CountVectorizer.__init__(self,
                              tokenizer=identity,
                              preprocessor=None,
                              lowercase=False,
                              binary=True)
 def __init__(self, window=None, sentence_splitter="\n", directional=False, **args):
     """Store window/splitter/direction settings, then initialise the base.

     Parameters
     ----------
     window : optional window size stored for later use.
     sentence_splitter : str used to split sentences (default newline).
     directional : bool flag stored for later use.
     **args : forwarded to ``CountVectorizer.__init__``.
     """
     self.directional = directional
     self.sentence_splitter = sentence_splitter
     self.window = window
     CountVectorizer.__init__(self, **args)