Ejemplo n.º 1
0
def get_vectorizer(f_x, header_x, nrows=None, max_features=100000):
    """Fit a ``MarisaTfidfVectorizer`` and persist it with joblib.

    The vectorizer is fitted on whatever ``iterText(f_x, header_x, nrows)``
    produces, dumped to ``ddir + 'joblib/marisa_vectorizer'`` and returned.

    Args:
        f_x: Forwarded unchanged to ``iterText``.
        header_x: Forwarded unchanged to ``iterText``.
        nrows: Forwarded unchanged to ``iterText``; defaults to ``None``.
        max_features: Passed to the vectorizer as ``max_features``.

    Returns:
        The vectorizer instance after ``fit`` has been called on it.
    """
    # ``ddir`` is a module-level name defined outside this function.
    dump_path = ddir + 'joblib/marisa_vectorizer'

    settings = {
        'min_df': 1,
        'max_features': max_features,
        'smooth_idf': True,
        'norm': 'l2',
        'sublinear_tf': False,
        'use_idf': True,
        'ngram_range': (1, 2),
    }
    vectorizer = MarisaTfidfVectorizer(**settings)

    vectorizer.fit(iterText(f_x, header_x, nrows))
    joblib.dump(vectorizer, dump_path)
    return vectorizer
Ejemplo n.º 2
0
def get_vectorizer(f_x, header_x, nrows=None, max_features=100000):
    """Build, fit and save the TF-IDF vectorizer.

    Args:
        f_x: First argument handed to ``iterText``.
        header_x: Second argument handed to ``iterText``.
        nrows: Third argument handed to ``iterText`` (``None`` by default).
        max_features: Value given to the vectorizer's ``max_features``.

    Returns:
        The ``MarisaTfidfVectorizer`` instance after fitting. It is also
        written with ``joblib.dump`` to ``ddir + 'joblib/marisa_vectorizer'``.
    """
    # Target path is resolved first; ``ddir`` comes from the enclosing module.
    target = ddir + 'joblib/marisa_vectorizer'

    tfidf = MarisaTfidfVectorizer(
        min_df=1,
        max_features=max_features,
        smooth_idf=True,
        norm='l2',
        sublinear_tf=False,
        use_idf=True,
        ngram_range=(1, 2),
    )

    documents = iterText(f_x, header_x, nrows)
    tfidf.fit(documents)

    joblib.dump(tfidf, target)
    return tfidf
Ejemplo n.º 3
0
def get_vectorizer_old(f_x, header_x, nrows=None, max_features=100000):
    """Fit the older vectorizer configuration and persist it with joblib.

    Stop words are read from two text files in the current working
    directory, one entry per ``'\\n'``-separated line, and merged into a set.
    The vectorizer is fitted on ``iterText_old(f_x, header_x, nrows)``,
    dumped to ``ddir + 'joblib/marisa_vectorizer'`` and returned.

    Args:
        f_x: Forwarded unchanged to ``iterText_old``.
        header_x: Forwarded unchanged to ``iterText_old``.
        nrows: Forwarded unchanged to ``iterText_old``; defaults to ``None``.
        max_features: Accepted but not used (see the NOTE in the body).

    Returns:
        The vectorizer instance after ``fit`` has been called on it.
    """
    stop_words = set()
    for stop_file in ('stop-words_french_1_fr.txt',
                      'stop-words_french_2_fr.txt'):
        with open(stop_file, "r") as handle:
            stop_words.update(handle.read().split('\n'))

    dump_path = ddir + 'joblib/marisa_vectorizer'

    # NOTE(review): the ``max_features`` argument is ignored here; the
    # vectorizer is always built with the hard-coded value 123456.
    settings = {
        'min_df': 1,
        'max_features': 123456,
        'stop_words': stop_words,
        'strip_accents': 'unicode',
        'smooth_idf': True,
        'norm': 'l2',
        'sublinear_tf': False,
        'use_idf': True,
        'ngram_range': (1, 3),
    }
    vectorizer = MarisaTfidfVectorizer(**settings)

    vectorizer.fit(iterText_old(f_x, header_x, nrows))
    joblib.dump(vectorizer, dump_path)
    return vectorizer
Ejemplo n.º 4
0
def get_vectorizer_old(f_x, header_x, nrows=None, max_features=100000):
    """Build, fit and save the legacy TF-IDF vectorizer with stop words.

    Args:
        f_x: First argument handed to ``iterText_old``.
        header_x: Second argument handed to ``iterText_old``.
        nrows: Third argument handed to ``iterText_old`` (``None`` by default).
        max_features: Present in the signature but not read by this
            function; the vectorizer uses a fixed ``max_features=123456``.

    Returns:
        The ``MarisaTfidfVectorizer`` instance after fitting. It is also
        written with ``joblib.dump`` to ``ddir + 'joblib/marisa_vectorizer'``.
    """
    # Gather the entries of both stop-word files into a single set.
    french_stop_words = set()
    stop_word_files = [
        'stop-words_french_1_fr.txt',
        'stop-words_french_2_fr.txt',
    ]
    for path in stop_word_files:
        with open(path, "r") as source:
            contents = source.read()
        french_stop_words.update(contents.split('\n'))

    target = ddir + 'joblib/marisa_vectorizer'

    tfidf = MarisaTfidfVectorizer(
        min_df=1,
        max_features=123456,  # NOTE(review): the parameter is not used here
        stop_words=french_stop_words,
        strip_accents='unicode',
        smooth_idf=True,
        norm='l2',
        sublinear_tf=False,
        use_idf=True,
        ngram_range=(1, 3),
    )

    documents = iterText_old(f_x, header_x, nrows)
    tfidf.fit(documents)

    joblib.dump(tfidf, target)
    return tfidf