def get_vectorizer(f_x, header_x, nrows=None, max_features=100000):
    """Fit a ``MarisaTfidfVectorizer`` on the text stream and persist it.

    The vectorizer is trained on whatever ``iterText(f_x, header_x, nrows)``
    yields, then serialized with ``joblib.dump`` to
    ``ddir + 'joblib/marisa_vectorizer'`` before being returned.

    Args:
        f_x: Forwarded unchanged to ``iterText`` (first argument).
        header_x: Forwarded unchanged to ``iterText`` (second argument).
        nrows: Forwarded unchanged to ``iterText``; presumably a row
            limit -- confirm against ``iterText``.
        max_features: Passed to the vectorizer as ``max_features``.

    Returns:
        The fitted ``MarisaTfidfVectorizer`` instance.
    """
    # Resolve the output location first, exactly as the original ordering did.
    dump_path = ddir + 'joblib/marisa_vectorizer'

    tfidf_options = dict(
        min_df=1,
        max_features=max_features,
        smooth_idf=True,
        norm='l2',
        sublinear_tf=False,
        use_idf=True,
        ngram_range=(1, 2),
    )
    vectorizer = MarisaTfidfVectorizer(**tfidf_options)

    vectorizer.fit(iterText(f_x, header_x, nrows))
    joblib.dump(vectorizer, dump_path)
    return vectorizer
def get_vectorizer(f_x, header_x, nrows=None, max_features=100000):
    """Train a ``MarisaTfidfVectorizer`` and save it with joblib.

    Fits on the iterable produced by ``iterText(f_x, header_x, nrows)``,
    dumps the fitted object to ``ddir + 'joblib/marisa_vectorizer'`` and
    returns it.

    Args:
        f_x: Passed through to ``iterText`` as its first argument.
        header_x: Passed through to ``iterText`` as its second argument.
        nrows: Passed through to ``iterText``; presumably caps the number
            of rows read -- confirm against ``iterText``.
        max_features: Value given to the vectorizer's ``max_features``.

    Returns:
        The fitted ``MarisaTfidfVectorizer``.
    """
    # The target path is computed up front, before any fitting work starts.
    target = ddir + 'joblib/marisa_vectorizer'

    params = {
        'min_df': 1,
        'max_features': max_features,
        'smooth_idf': True,
        'norm': 'l2',
        'sublinear_tf': False,
        'use_idf': True,
        'ngram_range': (1, 2),
    }
    fitted = MarisaTfidfVectorizer(**params)

    fitted.fit(iterText(f_x, header_x, nrows))
    joblib.dump(fitted, target)
    return fitted
def get_vectorizer_old(f_x, header_x, nrows=None, max_features=100000):
    """Fit the older stop-word-aware ``MarisaTfidfVectorizer`` and persist it.

    Loads two stop-word files from the current working directory, fits the
    vectorizer on ``iterText_old(f_x, header_x, nrows)``, dumps it with
    ``joblib.dump`` to ``ddir + 'joblib/marisa_vectorizer'`` and returns it.

    Args:
        f_x: Forwarded unchanged to ``iterText_old`` (first argument).
        header_x: Forwarded unchanged to ``iterText_old`` (second argument).
        nrows: Forwarded unchanged to ``iterText_old``; presumably a row
            limit -- confirm against ``iterText_old``.
        max_features: Accepted but not used by this function (see the
            note at the constructor call).

    Returns:
        The fitted ``MarisaTfidfVectorizer`` instance.
    """
    # Collect every line of both files; splitting on '\n' keeps any empty
    # trailing entry, which the set below collapses into a single ''.
    collected = []
    for stop_file in ('stop-words_french_1_fr.txt', 'stop-words_french_2_fr.txt'):
        with open(stop_file, "r") as handle:
            collected.extend(handle.read().split('\n'))
    stop_set = set(collected)

    dump_path = ddir + 'joblib/marisa_vectorizer'

    # NOTE(review): max_features is hard-coded to 123456 here, so the
    # ``max_features`` argument has no effect. Left as-is to keep the
    # fitted vocabulary unchanged for existing callers.
    tfidf_options = dict(
        min_df=1,
        max_features=123456,
        stop_words=stop_set,
        strip_accents='unicode',
        smooth_idf=True,
        norm='l2',
        sublinear_tf=False,
        use_idf=True,
        ngram_range=(1, 3),
    )
    vectorizer = MarisaTfidfVectorizer(**tfidf_options)

    vectorizer.fit(iterText_old(f_x, header_x, nrows))
    joblib.dump(vectorizer, dump_path)
    return vectorizer
def get_vectorizer_old(f_x, header_x, nrows=None, max_features=100000):
    """Train the legacy ``MarisaTfidfVectorizer`` variant and save it.

    Reads stop words from two text files in the current working directory,
    fits on ``iterText_old(f_x, header_x, nrows)``, writes the fitted object
    to ``ddir + 'joblib/marisa_vectorizer'`` via ``joblib.dump`` and
    returns it.

    Args:
        f_x: Passed through to ``iterText_old`` as its first argument.
        header_x: Passed through to ``iterText_old`` as its second argument.
        nrows: Passed through to ``iterText_old``; presumably caps the
            number of rows read -- confirm against ``iterText_old``.
        max_features: Accepted but currently ignored (see the note at the
            constructor call).

    Returns:
        The fitted ``MarisaTfidfVectorizer``.
    """
    # Gather raw lines from both files in order; duplicates and the empty
    # string left by a trailing newline are collapsed by the set below.
    raw_words = []
    for path in ('stop-words_french_1_fr.txt', 'stop-words_french_2_fr.txt'):
        with open(path, "r") as src:
            raw_words.extend(src.read().split('\n'))
    stopwords = set(raw_words)

    target = ddir + 'joblib/marisa_vectorizer'

    # NOTE(review): the constructor receives the literal 123456 rather than
    # the ``max_features`` argument; preserved to avoid changing results.
    params = {
        'min_df': 1,
        'max_features': 123456,
        'stop_words': stopwords,
        'strip_accents': 'unicode',
        'smooth_idf': True,
        'norm': 'l2',
        'sublinear_tf': False,
        'use_idf': True,
        'ngram_range': (1, 3),
    }
    fitted = MarisaTfidfVectorizer(**params)

    fitted.fit(iterText_old(f_x, header_x, nrows))
    joblib.dump(fitted, target)
    return fitted