def tfidf(mtname):
    """Apply tf-idf weighting to a term matrix stored in HDF5.

    Reads dataset ``mtname`` from ``hdf5/<mtname>.hdf5``, applies an
    unnormalized tf-idf transform, and writes the dense float32 result to
    ``hdf5/<mtname>_tfidf.hdf5`` under dataset ``<mtname>_tfidf``.

    - mtname (str): base name of the matrix / HDF5 dataset.
    """
    # Context managers guarantee the HDF5 handles are closed even if the
    # transform raises (the original leaked the handle on error).
    with h5py.File('hdf5/' + mtname + '.hdf5', 'r') as f:
        matrix = f[mtname][:]

    tfidf_transformer = TfidfTransformer()
    # norm=None keeps raw tf-idf weights (no row normalization).
    tfidf_transformer.set_params(norm=None)
    matrix = tfidf_transformer.fit_transform(matrix)

    with h5py.File('hdf5/' + mtname + '_tfidf.hdf5', 'w') as f2:
        f2.create_dataset(mtname + '_tfidf', data=matrix.todense(), dtype='f4')
    print('tf-idf performed')
def tfidf(mtname):
    """Apply tf-idf weighting to a term matrix stored in HDF5, in place.

    Reads dataset ``mtname`` from ``hdf5/<mtname>.hdf5``, applies an
    unnormalized tf-idf transform, then reopens the SAME file in 'w' mode
    and rewrites the dataset — i.e. the original raw counts are destroyed.

    - mtname (str): base name of the matrix / HDF5 dataset.
    """
    # Context managers guarantee the HDF5 handles are closed even if the
    # transform raises (the original leaked the handle on error).
    with h.File('hdf5/' + mtname + '.hdf5', 'r') as f:
        matrix = f[mtname][:]

    tfidf_transformer = TfidfTransformer()
    # norm=None keeps raw tf-idf weights (no row normalization).
    tfidf_transformer.set_params(norm=None)
    matrix = tfidf_transformer.fit_transform(matrix)

    # NOTE(review): 'w' truncates the input file — the source matrix is
    # overwritten by its tf-idf version. Intentional per original code.
    with h.File('hdf5/' + mtname + '.hdf5', 'w') as f2:
        f2.create_dataset(mtname, data=matrix.todense(), dtype='f4')
    print('tf-idf performed')
def ColumnAsBagOfWords(column, regex = None, save = False, load = False, applyTFIDF = True, path = '', **kwargs):
    """
    Encode a text column as a bag of words.

    - column (pd.Series): vector containing the text data.
    - regex (raw string): string with the tokenizer pattern. If not passed,
      a pattern where everything but "_", "." and "-" is accepted as a word
      is assumed.
    - save (bool): tells if the vectorizer(s) must be saved.
    - load (bool): tells if the vectorizer(s) must be loaded.
    - applyTFIDF (bool): tells if tf-idf (term frequency-inverse document
      frequency) should be applied.
    - path (string): path where the vectorizer is saved at or loaded from.
    - kwargs (dict): dictionary of CountVectorizer/TfidfTransformer
      parameters to set.

    Return (pd.DataFrame, CountVectorizer, TfidfTransformer):
        Resulting dataframe, count vectorizer (scikit-learn), tf-idf
        transformer (scikit-learn; None when applyTFIDF is False).
    """
    # If the directory doesn't exist, create it.
    util.CheckAndCreatePath(path)

    if regex is None:
        regex = r"[^_^.^-]+"

    ColumnName = column.name

    # Apply bag-of-words vectorization.
    if load:
        # SECURITY NOTE: pickle.load executes arbitrary code — only load
        # vectorizers from trusted paths.
        with open(path + "count_vectorizer_" + str(ColumnName) + ".pickle.dat", "rb") as fh:
            count_vect = pickle.load(fh)
    else:
        count_vect = CountVectorizer(token_pattern = regex)
    # Best-effort: kwargs are shared between both estimators, so unknown
    # parameter names (ValueError from set_params) are deliberately ignored.
    try:
        count_vect.set_params(**kwargs)
    except (TypeError, ValueError):
        pass
    columTransformed = count_vect.fit_transform(column)

    # Apply tf-idf (term frequency-inverse document frequency).
    if applyTFIDF:
        if load:
            with open(path + "tf_idf_" + str(ColumnName) + ".pickle.dat", "rb") as fh:
                tf_idf = pickle.load(fh)
        else:
            tf_idf = TfidfTransformer(norm='l1', use_idf=True)
        # Same best-effort parameter application as above.
        try:
            tf_idf.set_params(**kwargs)
        except (TypeError, ValueError):
            pass
        columTransformed = tf_idf.fit_transform(columTransformed)
    else:
        tf_idf = None

    # Build the column names in *column index* order. vocabulary_ maps
    # term -> column index; iterating .keys() directly (as the original code
    # did) yields insertion order, which does not match the matrix columns,
    # so the labels were misaligned with the data.
    terms_in_column_order = sorted(count_vect.vocabulary_.items(), key=lambda kv: kv[1])
    columnNames = [(str(ColumnName) + '-' + str(term)) for term, _ in terms_in_column_order]

    # Save vectorizer and tf-idf transformer ('with' closes the handles,
    # which the original bare open() calls leaked).
    if save:
        with open(path + "count_vectorizer_" + str(ColumnName) + ".pickle.dat", "wb") as fh:
            pickle.dump(count_vect, fh)
        if applyTFIDF:
            with open(path + "tf_idf_" + str(ColumnName) + ".pickle.dat", "wb") as fh:
                pickle.dump(tf_idf, fh)

    # Construct the final dataframe of the transformed column.
    df = pd.DataFrame(columTransformed.toarray(), columns=columnNames)
    return df, count_vect, tf_idf