Example 1
def test_vectorize_text():
    # text_df: small DataFrame with a "text" column, defined elsewhere in the test module
    # no processing
    result = process_text.vectorize_text(text_df,
                                         "text",
                                         remove_stopwords=False,
                                         tfidf=False,
                                         lemma=False,
                                         lsa=False)
    assert len(result.columns) == 12

    # no stop words
    result = process_text.vectorize_text(text_df,
                                         "text",
                                         remove_stopwords=True,
                                         tfidf=False,
                                         lemma=False,
                                         lsa=False)
    assert "so" not in list(result.columns)

    # lemmatize only
    result = process_text.vectorize_text(text_df,
                                         "text",
                                         remove_stopwords=False,
                                         tfidf=False,
                                         lemma=True,
                                         lsa=False)
    assert len(result.columns) == 12
    assert "be" in list(result.columns)
Example 2
# Imports this excerpt relies on ("start" and "process_text" are local project modules)
import pandas as pd
import scipy.spatial.distance
from openpyxl import load_workbook

import process_text
import start

TEXT_FILE_PATH = start.CLEAN_DATA_PATH + "text.csv"
ID = "id_attempt"
COLUMNS = ["text_clean"]

# %%

df = pd.read_csv(TEXT_FILE_PATH).set_index(ID)
df = df[COLUMNS]

# %%

# %% Version 1
matrix = process_text.vectorize_text(
    df,
    text_col="text_clean",
    remove_stopwords=True,
    tfidf=False,
    lemma=False,
    lsa=False,
)

file = start.RESULTS_PATH + "Pilot Study/Cosine Replicability 1.xlsx"
wb = load_workbook(file)
ws = wb.active

col = 2
for main in list(matrix.index):
    row = 2
    for comp in list(matrix.index):
        # cosine similarity = 1 - cosine distance between the two document vectors
        dist = 1 - scipy.spatial.distance.cosine(matrix.loc[main],
                                                 matrix.loc[comp])
        ws.cell(row=row, column=col).value = round(dist, 2)
        row += 1
    col += 1

wb.save(file)
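
The nested loop above fills the workbook one cell at a time; the same pairwise similarities can also be computed in a single call. This is a hedged alternative using scikit-learn's cosine_similarity, not part of the original script:

from sklearn.metrics.pairwise import cosine_similarity

# Labeled matrix of all pairwise cosine similarities; sims.loc[main, comp]
# matches 1 - scipy.spatial.distance.cosine(matrix.loc[main], matrix.loc[comp]).
sims = pd.DataFrame(cosine_similarity(matrix.values),
                    index=matrix.index, columns=matrix.index).round(2)

From there the values could be written into the loaded workbook in one pass, or exported directly with sims.to_excel, instead of cell by cell.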
Example 3
)
# Print the keywords in each of the 10 topics
print(lda_model.print_topics())
doc_lda = lda_model[corpus]

# %%
# Visualize the topics
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
vis
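
# %%
# Hedged aside, not in the original script: the interactive view above only
# renders inside a notebook. pyLDAvis.save_html writes a standalone HTML copy;
# the file name below is illustrative.
pyLDAvis.save_html(vis, "lda_topics.html")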

# %% CLUSTER
df = survey1[["text"]]

matrix = process_text.vectorize_text(df,
                                     text_col="text",
                                     remove_stopwords=True,
                                     tfidf=True)

num_clusters = 3
km = KMeans(n_clusters=num_clusters)
km.fit(matrix)
clusters = km.labels_.tolist()
df["cluster"] = clusters

grouped = df["text"].groupby(df["cluster"])

# %%
print("Top terms per cluster:")
print()
# for each cluster, rank feature indices by centroid weight, largest first
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
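
# %%
# Sketch of the usual follow-up: print the highest-weight terms in each cluster.
# Assumes vectorize_text returns a DataFrame whose columns are the vocabulary,
# as the tests in Example 1 suggest.
terms = matrix.columns
for i in range(num_clusters):
    top_terms = [terms[ind] for ind in order_centroids[i, :10]]
    print(f"Cluster {i}: {', '.join(top_terms)}")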