# Example no. 1
0
def train_tf_idf(file_list, **kwargs):
    """Fit a TF-IDF vectorizer on a collection of documents and vectorize them.

    With the default parameters each entry of ``file_list`` is treated as a
    file name (``input='filename'``) whose content is read by the vectorizer.

    Args:
        file_list: Iterable of documents to train on. It may be a one-shot
            iterable (e.g. a generator); it is materialized exactly once.
        **kwargs: Overrides for any ``TfidfVectorizer`` constructor
            parameter; they take precedence over the defaults below.

    Returns:
        A ``(vectorizer, tf_idf_matrix)`` tuple: the fitted vectorizer, with
        its ``input`` attribute switched to ``'content'`` so it can later be
        applied to raw strings, and the TF-IDF matrix of the training
        documents, one row per entry of ``file_list`` in the given order.
    """
    # Default params
    tf_idf_params = {
        'input': 'filename',
        'encoding': 'utf-8',
        'decode_error': 'replace',
        'strip_accents': 'unicode',
        'lowercase': True,
        'analyzer': 'word',
        'stop_words': 'english',
        'token_pattern': r'(?u)\b[a-zA-Z_][a-zA-Z0-9_]+\b',
        'ngram_range': (1, 2),
        'max_features': 5000,
        'norm': 'l2',
        'use_idf': True,
        'smooth_idf': True,
        'sublinear_tf': True,
        'max_df': 1.0,
        'min_df': 1,
    }

    # Caller-supplied settings win over the defaults.
    tf_idf_params.update(kwargs)

    # Materialize once: a generator would otherwise be exhausted by the copy
    # and leave nothing for the vectorizer to read.
    documents = list(file_list)

    # A single fit_transform is enough. The previous implementation first
    # fitted on a shuffled copy and then called fit_transform, which refits
    # from scratch -- the first pass only doubled the file reads.
    vectorizer = TfidfVectorizer(**tf_idf_params)
    tf_idf_matrix = vectorizer.fit_transform(documents)

    # Switch the input type to raw content (strings) for later use, so
    # callers can transform in-memory text instead of file names.
    vectorizer.input = 'content'

    return vectorizer, tf_idf_matrix