def prepare_vectorizer(max_n_gram_size, feature_threshold=50000):
    """Build a TF-IDF vectorizer for French text plus its tokenizer.

    Args:
        max_n_gram_size: upper bound of the word n-gram range (1..N).
        feature_threshold: cap on the TF-IDF vocabulary size
            (``max_features``), defaults to 50000.

    Returns:
        A ``(tfidf_vectorizer, tokenizer)`` pair, where ``tokenizer`` is a
        one-argument callable applying the shared stemmer/stop-word
        tokenization to a text.
    """
    # n-gram expander; NOTE(review): we call its private _word_ngrams
    # method below — this is an internal sklearn API and may break across
    # sklearn versions.
    ngram_counter = CountVectorizer(ngram_range=(1, max_n_gram_size))
    stemmer = FrenchStemmer()

    # French stop words, extended with common greeting/courtesy words.
    ignored_words = set(stopwords.words('french'))
    ignored_words.update(("bonjour", "salut", "merci", "non"))

    def analyze(text):
        # Tokenize/stem first, then expand the token stream into n-grams.
        return ngram_counter._word_ngrams(tokenize(stemmer, ignored_words, text))

    tfidf_vectorizer = TfidfVectorizer(analyzer=analyze,
                                       max_features=feature_threshold)
    return tfidf_vectorizer, lambda text: tokenize(stemmer, ignored_words, text)
def python_analyzer(doc):
    """Turn a Python source string into word n-grams of coarse lexical markers.

    Each token is mapped to a marker: identifiers are kept verbatim,
    indentation, strings and numbers become ``is_indent`` / ``is_dedent`` /
    ``is_string`` / ``is_number``, and operators are bucketed into
    categories (arithmetic, logic, list/dict/class/attribution/block).
    Unrecognized token and operator kinds are silently dropped.

    Args:
        doc: Python source code as a single string.

    Returns:
        The marker sequence expanded into word n-grams via the module-level
        ``vectorizer_params`` configuration.

    Notes:
        Tokenization errors (``IndentationError``, ``tokenize.TokenError``)
        are swallowed deliberately: a partially tokenized document still
        yields whatever markers were collected before the failure.
    """
    # Operator -> category marker. Built once per call; the previous
    # version re-created this dict literal for every OP token inside the
    # tokenization loop.
    # NOTE(review): "-=" is classified as is_op_logic while "+=" is
    # is_op_arit — looks inconsistent, but preserved as-is to keep the
    # learned feature space unchanged; confirm intent before changing.
    op_lookup = {
        "+": "is_op_arit",
        "+=": "is_op_arit",
        "-": "is_op_arit",
        "*": "is_op_arit",
        "**": "is_op_arit",
        "/": "is_op_arit",
        "//": "is_op_arit",
        "%": "is_op_arit",
        ">": "is_op_logic",
        "<": "is_op_logic",
        ">=": "is_op_logic",
        "<=": "is_op_logic",
        "==": "is_op_logic",
        "-=": "is_op_logic",
        "!=": "is_op_logic",
        "[": "is_list",
        "{": "is_dict",
        ".": "is_class",
        "=": "is_attribution",
        ":": "is_block",
    }

    words = []
    not_found = []  # operators with no category, kept for debugging
    # vectorizer_params is a module-level configuration dict — assumed to
    # hold CountVectorizer keyword arguments; verify against callers.
    vectorizer = CountVectorizer(**vectorizer_params)
    file = io.StringIO(doc)
    try:
        for token in tokenize.generate_tokens(file.readline):
            token_type = tokenize.tok_name[token[0]]
            # removed_itens / allowed_itens are module-level filters; the
            # double membership test mirrors the original's belt-and-braces
            # filtering, collapsed into one condition.
            if token_type in removed_itens or token_type not in allowed_itens:
                continue
            if token_type == "NAME":
                # Keep variable / reserved names verbatim.
                words.append(token[1])
            elif token_type == "INDENT":
                words.append("is_indent")
            elif token_type == "DEDENT":
                words.append("is_dedent")
            elif token_type == "STRING":
                words.append("is_string")
            elif token_type == "NUMBER":
                words.append("is_number")
            elif token_type == "OP":
                marker = op_lookup.get(token[1])
                if marker is not None:
                    words.append(marker)
                else:
                    not_found.append(token[1])
    except (IndentationError, tokenize.TokenError):
        # Best-effort: keep whatever was tokenized before the failure.
        pass
    # NOTE(review): _word_ngrams is a private sklearn API; may break
    # across sklearn versions.
    return vectorizer._word_ngrams(words)