def useful_words(ls):
    """Return the entries of ``ls`` that are not among its lowest-weighted terms.

    A ``CountVectorizer`` vocabulary is fitted on ``ls``. All strings are then
    joined with spaces into a single document, which is weighted with a
    default ``TfidfTransformer``. Every vocabulary term whose weight equals
    the minimum weight in that document is eliminated, unless all terms share
    the same weight, in which case nothing is eliminated.

    Args:
        ls: List of strings. Entries are compared to the vectorizer's feature
            names by plain string equality.
            NOTE(review): presumably each entry is a single lowercase word;
            entries that do not equal a feature name can never be eliminated.
            Confirm against callers.

    Returns:
        A list of the distinct entries of ``ls`` that were not eliminated.
        It is built from a set, so duplicates collapse and the order is
        unspecified.
    """
    bow_transformer = CountVectorizer().fit(ls)
    joined_counts = bow_transformer.transform([" ".join(ls)])
    weights = TfidfTransformer().fit_transform(joined_counts).tocoo()

    # No stored weights means there is nothing to compare or eliminate.
    if weights.data.size == 0:
        return list(set(ls))

    # Computed once; the extremes do not change while scanning the entries.
    lowest = weights.data.min()
    if lowest == weights.data.max():
        # Every term carries the same weight, so none is "least useful".
        return list(set(ls))

    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement and fall back only on releases that predate it.
    if hasattr(bow_transformer, "get_feature_names_out"):
        feature_names = bow_transformer.get_feature_names_out()
    else:
        feature_names = bow_transformer.get_feature_names()

    # The matrix has a single row, so each stored entry is one vocabulary
    # column paired with its weight.
    eliminated = {
        feature_names[col]
        for col, weight in zip(weights.col, weights.data)
        if weight == lowest
    }
    return list(set(ls) - eliminated)
def get_tfidf(count_by_plz, vocab):
    """Compute TF-IDF weights per PLZ from raw word counts.

    A count matrix with one row per entry of the module-level ``PLZ`` and one
    column per vocabulary index is built, passed through a default
    ``TfidfTransformer``, and the stored entries of the sparse result are
    unpacked into nested dictionaries. Progress messages are printed along
    the way.

    Args:
        count_by_plz: Mapping indexed first by PLZ and then by word, yielding
            that word's count.
        vocab: Object supporting ``len()`` and ``get_word(index)``.

    Returns:
        A dict mapping every PLZ to a ``defaultdict(int)`` of word -> TF-IDF
        value. Only entries stored in the sparse result are written; other
        words fall back to the defaultdict's ``0``.
    """
    print('building array...')
    # Collect counts word by word (one list per vocabulary index), then flip
    # the axes so that rows correspond to PLZ entries.
    per_word_counts = []
    for word_idx in range(len(vocab)):
        counts_for_word = []
        for plz in PLZ:
            counts_for_word.append(count_by_plz[plz][vocab.get_word(word_idx)])
        per_word_counts.append(counts_for_word)
    counts = np.array(per_word_counts).T
    print('done, shape:', counts.shape)

    print('calculating TFIDF...')
    transformer = TfidfTransformer()
    tfidf = transformer.fit_transform(counts)
    print('done, type and shape:', type(tfidf), tfidf.shape)

    # Start every PLZ with its own empty defaultdict.
    tfidf_by_plz = {}
    for plz in PLZ:
        tfidf_by_plz[plz] = defaultdict(int)

    # COO form exposes parallel row / column / value arrays of stored entries.
    coo = tfidf.tocoo()
    for plz_idx, word_idx, weight in zip(coo.row, coo.col, coo.data):
        word = vocab.get_word(word_idx)
        tfidf_by_plz[PLZ[plz_idx]][word] = weight
    return tfidf_by_plz