Example #1
0
def tfidf(mtname):
    """Apply a TF-IDF transform to the matrix stored in hdf5/<mtname>.hdf5.

    Reads dataset ``mtname`` from ``hdf5/<mtname>.hdf5``, transforms it with
    scikit-learn's TfidfTransformer (row normalization disabled), and writes
    the dense result to ``hdf5/<mtname>_tfidf.hdf5`` under the dataset name
    ``<mtname>_tfidf``.

    - mtname (str): base name of the HDF5 file and of the dataset inside it.
    """
    # Context managers guarantee the HDF5 handles are closed even if an
    # exception is raised between open and close.
    with h5py.File('hdf5/' + mtname + '.hdf5', 'r') as f:
        matrix = f[mtname][:]
    # norm=None keeps raw tf-idf weights (no L1/L2 row normalization);
    # passing it to the constructor replaces the separate set_params call.
    tfidf_transformer = TfidfTransformer(norm=None)
    matrix = tfidf_transformer.fit_transform(matrix)
    with h5py.File('hdf5/' + mtname + '_tfidf.hdf5', 'w') as f2:
        # toarray() yields a plain ndarray; todense() returns the
        # deprecated np.matrix type.
        f2.create_dataset(mtname + '_tfidf', data=matrix.toarray(), dtype='f4')
    print('tf-idf performed')
Example #2
0
def tfidf(mtname):
    """Apply a TF-IDF transform to the matrix stored in hdf5/<mtname>.hdf5.

    Reads dataset ``mtname`` from ``hdf5/<mtname>.hdf5``, transforms it with
    scikit-learn's TfidfTransformer (row normalization disabled), and writes
    the dense result BACK INTO THE SAME FILE under the same dataset name,
    replacing the original counts.

    - mtname (str): base name of the HDF5 file and of the dataset inside it.
    """
    # Context managers guarantee the HDF5 handles are closed even on error.
    with h.File('hdf5/' + mtname + '.hdf5', 'r') as f:
        matrix = f[mtname][:]
    # norm=None keeps raw tf-idf weights (no L1/L2 row normalization).
    tfidf_transformer = TfidfTransformer(norm=None)
    matrix = tfidf_transformer.fit_transform(matrix)
    # NOTE(review): opening the input file in 'w' mode truncates it, so the
    # raw count matrix is destroyed; confirm this in-place overwrite is
    # intended (the sibling variant writes a separate *_tfidf.hdf5 file).
    with h.File('hdf5/' + mtname + '.hdf5', 'w') as f2:
        # toarray() yields a plain ndarray; todense() returns the
        # deprecated np.matrix type.
        f2.create_dataset(mtname, data=matrix.toarray(), dtype='f4')
    print('tf-idf performed')
Example #3
0
def ColumnAsBagOfWords(column, regex = None, save = False, load = False, 
                       applyTFIDF = True, path = '', **kwargs):
    """
    Encode text column as bag of words.
    
    - column (pd.Series): vector containing the text data.
    - regex (raw string): string with the tokenizer pattern. If not passed,
    the default r"[^_^.^-]+" is used: runs of characters other than "_",
    "^", "." and "-" count as words (note: spaces are NOT excluded).
    - save (bool): bool that tells if the Vectorizer must be saved.
    - load (bool): bool that tells if the Vectorizer must be loaded.
    - applyTFIDF (bool): bool that tells if tf-idf (term frequency-inverse
    document frequency) should be applied.
    - path (string): path where the Vectorizer must be saved at or loaded from.
    - kwargs (dict): dictionary of CountVectorizer/TfidfTransformer parameters
    to set; keys not accepted by an estimator are silently ignored for it.
    
    Return (pd.DataFrame, CountVectorizer, TfidfTransformer): Resulting
    dataframe, Count Vectorizer (scikit-learn), Tfidf Transformer
    (scikit-learn; None when applyTFIDF is False).
    """
    # if directory doesn't exist, create it
    util.CheckAndCreatePath(path)  
        
    if regex is None:
        # Inside a character class, '^' after the first position is a
        # literal, so this excludes '_', '^', '.' and '-' only.
        regex = r"[^_^.^-]+"
    
    ColumnName = column.name
    
    # Apply bag of words vectorization.
    # NOTE(review): pickle.load on an untrusted file can execute arbitrary
    # code — only load vectorizers this process previously saved.
    if load:
        with open(path + "count_vectorizer_" + str(ColumnName) +
                  ".pickle.dat", "rb") as fh:
            count_vect = pickle.load(fh)
    else:
        count_vect = CountVectorizer(token_pattern = regex)
    
    # Capture count vectorizer parameters from kwargs; set_params raises
    # ValueError on keys that belong to the other estimator, so narrow the
    # catch instead of a bare except that would hide real bugs.
    try:
        count_vect.set_params(**kwargs)
    except (ValueError, TypeError):
        pass
        
    # NOTE(review): even when load=True the vectorizer is re-fitted here;
    # use transform() instead if the saved vocabulary must be reused as-is.
    columTransformed = count_vect.fit_transform(column)
    
    # Apply tf-idf (term frequency-inverse document frequency).
    if applyTFIDF:
        if load:
            with open(path + "tf_idf_" + str(ColumnName) +
                      ".pickle.dat", "rb") as fh:
                tf_idf = pickle.load(fh)
        else:
            tf_idf = TfidfTransformer(norm='l1', use_idf=True)
    
        # Capture Tfidf Transformer parameters from kwargs (same narrowing
        # rationale as above).
        try:
            tf_idf.set_params(**kwargs)
        except (ValueError, TypeError):
            pass
    
        columTransformed = tf_idf.fit_transform(columTransformed)
    else:
        tf_idf = None

    # Create the vector of column names. Column j of the sparse matrix
    # corresponds to the token whose vocabulary_ index is j, so order the
    # tokens by index — vocabulary_ dict iteration order is insertion
    # order, NOT column order, and would mislabel the columns.
    vocab = count_vect.vocabulary_
    columnNames = [str(ColumnName) + '-' + str(token)
                   for token in sorted(vocab, key=vocab.get)]
    
    # Save vectorizer and tf-idf transformer (context managers ensure the
    # file handles are flushed and closed).
    if save:
        with open(path + "count_vectorizer_" + str(ColumnName) +
                  ".pickle.dat", "wb") as fh:
            pickle.dump(count_vect, fh)
        if applyTFIDF:
            with open(path + "tf_idf_" + str(ColumnName) +
                      ".pickle.dat", "wb") as fh:
                pickle.dump(tf_idf, fh)
    
    # Construct final dataframe of transformed column
    df = pd.DataFrame(columTransformed.toarray(), columns=columnNames)
    
    return df, count_vect, tf_idf