def useful_words(ls):
    """
    Return the distinct strings of ``ls`` minus those that are its rarest words.

    The strings are joined into a single document, vectorized with a
    ``CountVectorizer`` fitted on ``ls`` and weighted with a
    ``TfidfTransformer``.  Every vocabulary term whose weight equals the
    smallest weight in that document is dropped from the result, unless all
    terms share the same weight, in which case nothing is dropped.

    :param ls: list of strings (each must be hashable).
    :return: list of the distinct elements of ``ls`` that were not dropped,
        in order of first appearance.  Elements are compared with the
        vectorizer's feature names by plain equality.
    """
    bow_transformer = CountVectorizer().fit(ls)
    # The whole input is treated as one document, so the sparse result has
    # one stored weight per vocabulary term that occurs in it.
    counts = bow_transformer.transform([" ".join(ls)])
    weights = TfidfTransformer().fit_transform(counts).tocoo()

    # dict.fromkeys de-duplicates like set() but keeps a deterministic order.
    distinct = list(dict.fromkeys(ls))
    if weights.data.size == 0:
        return distinct

    # Hoisted out of the loop: these never change while scanning the terms.
    lowest = weights.data.min()
    if lowest == weights.data.max():
        # Every term is equally weighted; there is no "rarest" group.
        return distinct

    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement and fall back only on releases that predate it.
    names_getter = getattr(bow_transformer, "get_feature_names_out", None)
    if names_getter is None:
        names_getter = bow_transformer.get_feature_names
    feature_names = names_getter()

    # tmp-list equivalent: the terms to be eliminated.
    rarest = {feature_names[col] for col in weights.col[weights.data == lowest]}
    return [word for word in distinct if word not in rarest]
# Example #2
# 0
def get_tfidf(count_by_plz, vocab):
    """
    Compute TF-IDF weights per entry of the module-level ``PLZ`` sequence.

    A count matrix is built with one row per entry of ``PLZ`` and one column
    per vocabulary index, where each cell is
    ``count_by_plz[plz][vocab.get_word(index)]``.  The matrix is passed
    through ``TfidfTransformer`` and the entries stored in the sparse result
    are copied into nested dictionaries.  Progress is printed along the way.

    :param count_by_plz: mapping indexed first by an entry of ``PLZ`` and
        then by word, yielding a count.
    :param vocab: object supporting ``len()`` and ``get_word(index)``.
    :return: dict mapping each entry of ``PLZ`` to a ``defaultdict(int)``
        from word to TF-IDF weight; words without a stored entry read as 0.
    """
    print('building array...')
    per_word_rows = []
    for word_idx in range(len(vocab)):
        per_word_rows.append(
            [count_by_plz[plz][vocab.get_word(word_idx)] for plz in PLZ])
    # Rows were collected per word; flip so that rows follow PLZ instead.
    counts = np.array(per_word_rows).T
    print('done, shape:', counts.shape)

    print('calculating TFIDF...')
    transformer = TfidfTransformer()
    tfidf = transformer.fit_transform(counts)
    print('done, type and shape:', type(tfidf), tfidf.shape)

    coo = tfidf.tocoo()
    tfidf_by_plz = {}
    for plz in PLZ:
        tfidf_by_plz[plz] = defaultdict(int)
    # Only the entries stored in the sparse matrix are written out.
    for row, col, weight in zip(coo.row, coo.col, coo.data):
        word = vocab.get_word(col)
        tfidf_by_plz[PLZ[row]][word] = weight

    return tfidf_by_plz