Example #1
import numpy as np
import pandas as pd

## texts_nwd_csr, nwd_H_shuffle, and nwd_tfidf_csr are project-local helpers
def run_stopword_statistics(list_texts, N_s=100, path_stopword_list=None):
    '''
    Make a dataframe with ranking of words according to different metrics
    '''

    ## make csr matrix
    n_wd_csr, dict_w_iw = texts_nwd_csr(list_texts)
    V, D = n_wd_csr.shape

    ## get entropy measure
    result_H = nwd_H_shuffle(n_wd_csr, N_s=N_s)

    ## get tfidf
    arr_tfidf_w = nwd_tfidf_csr(n_wd_csr)

    ## make dataframe
    df = pd.DataFrame(index=sorted(list(dict_w_iw.keys())))

    df['F'] = result_H['F-emp']
    df['I'] = result_H['H-null-mu'] - result_H['H-emp']
    df['tfidf'] = arr_tfidf_w

    ## get stopword list if file with list was provided
    if path_stopword_list is not None:
        with open(path_stopword_list, 'r') as f:
            x = f.readlines()
        stopwords = [h.strip() for h in x]
        arr_manual = np.zeros(V)
        for w in stopwords:
            try:
                iw = dict_w_iw[w]
                arr_manual[iw] = 1
            except KeyError:
                pass
        df['manual'] = arr_manual
        ## replace 0 with nan so that these words will not be filtered out
        df['manual'] = df['manual'].replace(to_replace=0, value=np.nan)

    ## get entropy and random entropy too
    df['H'] = result_H['H-emp']
    df['H-tilde'] = result_H['H-null-mu']
    df['H-tilde_std'] = result_H['H-null-std']
    df['N'] = np.array(n_wd_csr.sum(axis=1))[:, 0]  ## number of counts

    return df
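A minimal usage sketch (the toy corpus, the N_s value, and the "lowest I first" ranking are assumptions made here for illustration; run_stopword_statistics relies on the project helpers texts_nwd_csr, nwd_H_shuffle, and nwd_tfidf_csr being importable):

## Hypothetical toy corpus: a list of tokenized documents.
list_texts = [['the', 'cat', 'sat', 'on', 'the', 'mat'],
              ['the', 'dog', 'ran'],
              ['a', 'cat', 'and', 'a', 'dog']]
df_stop = run_stopword_statistics(list_texts, N_s=10)
## one common choice: words with the smallest information I = H-tilde - H
## (empirical entropy close to the shuffled null) behave like stopwords
candidates = df_stop.sort_values('I', ascending=True).head(10).index.tolist()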
Example #2
import numpy as np

def nwd_csr_shuffle(n_wd_csr):
    '''
    Obtain n_wd from shuffling tokens across documents.
    Gives n_wd from one random realization
    '''
    N_w = np.array(n_wd_csr.sum(axis=1).transpose())[0]
    N_d = np.array(n_wd_csr.sum(axis=0))[0]

    list_texts_flat = []
    for i_w, n_w in enumerate(N_w):
        list_texts_flat += [i_w] * n_w
    np.random.shuffle(list_texts_flat)

    list_texts_random = []
    n = 0
    for m in N_d:
        text_tmp = list_texts_flat[n:n + m]
        list_texts_random += [text_tmp]
        n += m

    ## current bottleneck: rebuilding the csr matrix takes ~6x longer than the
    ## shuffling itself (the csr constructor dominates, given row, col, data)
    n_wd_csr_r, _ = texts_nwd_csr(list_texts_random)
    return n_wd_csr_r
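For orientation, a sketch of how N_s shuffled realizations could be condensed into the null-model quantities ('H-null-mu', 'H-null-std') consumed in Example #1; this only illustrates the idea, it is not the project's nwd_H_shuffle, and word_entropy / null_entropy_stats are names introduced here:

import numpy as np

def word_entropy(n_wd_csr):
    '''Entropy (in bits) of each word's count distribution across documents.'''
    n_wd = np.asarray(n_wd_csr.todense(), dtype=float)
    p = n_wd / n_wd.sum(axis=1, keepdims=True)  ## p(d|w), row-normalized
    plogp = np.where(p > 0, p * np.log2(np.where(p > 0, p, 1.0)), 0.0)
    return -plogp.sum(axis=1)

def null_entropy_stats(n_wd_csr, N_s=100):
    '''Mean and std of word entropies over N_s shuffled realizations
    (assumes texts_nwd_csr keeps rows aligned with the original word indices).'''
    H_r = np.array([word_entropy(nwd_csr_shuffle(n_wd_csr)) for _ in range(N_s)])
    return H_r.mean(axis=0), H_r.std(axis=0)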
Example #3
import os
import numpy as np
import pandas as pd

## Load data (shi_json_to_texts and texts_nwd_csr are project-local helpers;
## name_corpus, path_data, and lang are assumed to be defined upstream)
# path_read = '../../data/'
fname_read = '%s.json' % (name_corpus)
filename = os.path.join(path_data, fname_read)

## read Hanyu's json format
x_data = shi_json_to_texts(filename)  ## contains the data
## vocabulary and list of texts
list_words = x_data['list_w']
list_texts = x_data['list_texts']
dict_w_iw = x_data['dict_w_iw']
list_labels = x_data['list_c']
D = len(list_texts)

## sparse dataframe (pd.SparseDataFrame was removed in pandas >= 1.0;
## use the sparse accessor constructor instead)
X_csr, dict_w_iw = texts_nwd_csr(list_texts)
df = pd.DataFrame.sparse.from_spmatrix(X_csr.transpose(),
                                        index=np.arange(D),
                                        columns=list_words)

## set up and run the stopword statistics
path_stopword_list = os.path.join(path_data, 'stopword_list_%s' %
                                  (lang))  ## path to stopword list
N_s = 100  ## number of realizations
df_stop = run_stopword_statistics(list_texts,
                                  N_s=N_s,
                                  path_stopword_list=path_stopword_list)

## loop over the fraction of tokens to remove
cutoff_type = 'p'
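The loop itself is cut off in this excerpt; a hypothetical sketch of what it could look like, assuming cutoff_type = 'p' means removing the fraction p of tokens carried by the lowest-ranked words, and that df_stop is indexed by the same word strings that occur in list_texts:

## rank words by information I (lowest first) and, for each fraction p, drop
## the words whose cumulative token count covers the bottom p of the corpus
for p in [0.1, 0.2, 0.3]:
    ranked = df_stop.sort_values('I', ascending=True)
    cum_frac = ranked['N'].cumsum() / ranked['N'].sum()
    stopwords_p = set(ranked[cum_frac <= p].index)
    list_texts_filtered = [[w for w in doc if w not in stopwords_p]
                           for doc in list_texts]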