def prepare_vectorizer(max_n_gram_size, feature_threshold=50000):
    """Build a TF-IDF vectorizer for French text plus its tokenizer.

    Args:
        max_n_gram_size: upper bound of the word n-gram range (1..N).
        feature_threshold: cap on the TF-IDF vocabulary size
            (``max_features``), defaults to 50000.

    Returns:
        A ``(tfidf_vectorizer, tokenizer)`` pair, where ``tokenizer`` is a
        one-argument callable applying the shared stemmer/stop-word
        tokenization to a text.
    """
    # n-gram expander; NOTE(review): we call its private _word_ngrams
    # method below — this is an internal sklearn API and may break across
    # sklearn versions.
    ngram_counter = CountVectorizer(ngram_range=(1, max_n_gram_size))
    stemmer = FrenchStemmer()

    # French stop words, extended with common greeting/courtesy words.
    ignored_words = set(stopwords.words('french'))
    ignored_words.update(("bonjour", "salut", "merci", "non"))

    def analyze(text):
        # Tokenize/stem first, then expand the token stream into n-grams.
        return ngram_counter._word_ngrams(tokenize(stemmer, ignored_words, text))

    tfidf_vectorizer = TfidfVectorizer(analyzer=analyze,
                                       max_features=feature_threshold)
    return tfidf_vectorizer, lambda text: tokenize(stemmer, ignored_words, text)
def python_analyzer(doc):
    """Turn a Python source string into word n-grams of coarse lexical markers.

    Each token is mapped to a marker: identifiers are kept verbatim,
    indentation, strings and numbers become ``is_indent`` / ``is_dedent`` /
    ``is_string`` / ``is_number``, and operators are bucketed into
    categories (arithmetic, logic, list/dict/class/attribution/block).
    Unrecognized token and operator kinds are silently dropped.

    Args:
        doc: Python source code as a single string.

    Returns:
        The marker sequence expanded into word n-grams via the module-level
        ``vectorizer_params`` configuration.

    Notes:
        Tokenization errors (``IndentationError``, ``tokenize.TokenError``)
        are swallowed deliberately: a partially tokenized document still
        yields whatever markers were collected before the failure.
    """
    # Operator -> category marker. Built once per call; the previous
    # version re-created this dict literal for every OP token inside the
    # tokenization loop.
    # NOTE(review): "-=" is classified as is_op_logic while "+=" is
    # is_op_arit — looks inconsistent, but preserved as-is to keep the
    # learned feature space unchanged; confirm intent before changing.
    op_lookup = {
        "+": "is_op_arit",
        "+=": "is_op_arit",
        "-": "is_op_arit",
        "*": "is_op_arit",
        "**": "is_op_arit",
        "/": "is_op_arit",
        "//": "is_op_arit",
        "%": "is_op_arit",
        ">": "is_op_logic",
        "<": "is_op_logic",
        ">=": "is_op_logic",
        "<=": "is_op_logic",
        "==": "is_op_logic",
        "-=": "is_op_logic",
        "!=": "is_op_logic",
        "[": "is_list",
        "{": "is_dict",
        ".": "is_class",
        "=": "is_attribution",
        ":": "is_block",
    }

    words = []
    not_found = []  # operators with no category, kept for debugging
    # vectorizer_params is a module-level configuration dict — assumed to
    # hold CountVectorizer keyword arguments; verify against callers.
    vectorizer = CountVectorizer(**vectorizer_params)
    file = io.StringIO(doc)
    try:
        for token in tokenize.generate_tokens(file.readline):
            token_type = tokenize.tok_name[token[0]]
            # removed_itens / allowed_itens are module-level filters; the
            # double membership test mirrors the original's belt-and-braces
            # filtering, collapsed into one condition.
            if token_type in removed_itens or token_type not in allowed_itens:
                continue
            if token_type == "NAME":
                # Keep variable / reserved names verbatim.
                words.append(token[1])
            elif token_type == "INDENT":
                words.append("is_indent")
            elif token_type == "DEDENT":
                words.append("is_dedent")
            elif token_type == "STRING":
                words.append("is_string")
            elif token_type == "NUMBER":
                words.append("is_number")
            elif token_type == "OP":
                marker = op_lookup.get(token[1])
                if marker is not None:
                    words.append(marker)
                else:
                    not_found.append(token[1])
    except (IndentationError, tokenize.TokenError):
        # Best-effort: keep whatever was tokenized before the failure.
        pass
    # NOTE(review): _word_ngrams is a private sklearn API; may break
    # across sklearn versions.
    return vectorizer._word_ngrams(words)