import numpy as np
import pandas as pd
from sklearn.decomposition import PCA

from word2vec_platform.word2vecReader import Word2Vec


def get_averages_for_tests(df, treatment):
    model_path = "word2vec_twitter_model/word2vec_twitter_model.bin"
    print("Loading the model, this can take some time...")
    model = Word2Vec.load_word2vec_format(model_path, binary=True, norm_only=True)

    df["text_vector"] = None
    matrix = []
    has_treatment = {}
    no_treatment = {}
    pca = PCA(n_components=85, random_state=1)

    # Average the word vectors of each tweet into a single 400-dimensional vector.
    for index, row in df.iterrows():
        tokens = preprocess_tweet(row["text"])
        vector = np.zeros(400)
        d = 0
        for token in tokens:
            if token in model.vocab:
                vector += model.syn0norm[model.vocab[token].index]
                d += 1
        if d != 0:
            vector = vector / d
            put = "OK"
            matrix.append(vector)
        else:
            # No in-vocabulary token found; mark the row so it can be dropped.
            put = "E"
        df.at[index, "text_vector"] = put

    # Drop tweets without any in-vocabulary tokens and reduce the averaged
    # vectors to 85 dimensions with PCA.
    df = df[df.text_vector != "E"]
    df = df.reset_index(drop=True)
    matrix_reduced = pca.fit_transform(matrix)

    index_for_treated = 0
    index_for_non_treated = 0
    has_treatment_popularity = {}
    no_treatment_popularity = {}

    # Split the rows into treated and untreated groups, collecting the
    # covariates (tweet length, time of day, reduced text vector) per group.
    for index, row in df.iterrows():
        vector = matrix_reduced[index]
        if row[treatment] == 1:
            has_treatment_popularity[index_for_treated] = row["popularity"]
            has_treatment[index_for_treated] = {
                "length": row["text_length"],
                "time_of_day": row["time_of_day"],
            }
            for i, data_point in enumerate(vector):
                has_treatment[index_for_treated][i] = data_point
            index_for_treated += 1
        else:
            no_treatment_popularity[index_for_non_treated] = row["popularity"]
            no_treatment[index_for_non_treated] = {"length": row["text_length"]}
            for i, data_point in enumerate(vector):
                no_treatment[index_for_non_treated][i] = data_point
            index_for_non_treated += 1

    return (
        has_treatment_popularity,
        no_treatment_popularity,
        pd.DataFrame.from_dict(has_treatment, orient="index"),
        pd.DataFrame.from_dict(no_treatment, orient="index"),
    )
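# The functions in this file call preprocess_tweet, which is defined elsewhere in
# the project. The sketch below is only an assumption of what that tokenizer might
# look like (lowercasing, stripping URLs and @-mentions, whitespace splitting); the
# actual project helper may differ.
import re


def preprocess_tweet(text):
    """Hypothetical tweet tokenizer: lowercase, drop URLs and mentions, split on whitespace."""
    text = text.lower()
    text = re.sub(r"https?://\S+", "", text)  # strip URLs
    text = re.sub(r"@\w+", "", text)          # strip @-mentions
    return text.split()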
def get_averages_for_tests_images(df, keys):
    model_path = "word2vec_twitter_model/word2vec_twitter_model.bin"
    print("Loading the model, this can take some time...")
    model = Word2Vec.load_word2vec_format(model_path, binary=True, norm_only=True)

    # Split tweets by whether they carry an image.
    has = df[df.images == 1].reset_index(drop=True)
    no = df[df.images == 0].reset_index(drop=True)

    # Aggregate popularity and text per user, then turn the aggregated text
    # into averaged word vectors.
    tmp_no = no.groupby(["user.id"])[["popularity", "text"]].apply(convert)
    tmp_has = has.groupby(["user.id"])[["popularity", "text"]].apply(convert)
    tmp_no = create_vectors_from_sentence(tmp_no, model)
    tmp_has = create_vectors_from_sentence(tmp_has, model)

    # Join the per-user vectors with the requested covariate columns.
    has_treatment_data = create_joined_data_sets(tmp_has, has, keys)
    no_treatment_data = create_joined_data_sets(tmp_no, no, keys)

    tmp_has = tmp_has[["popularity"]]
    tmp_no = tmp_no[["popularity"]]
    return (
        tmp_has.to_dict(orient="index"),
        tmp_no.to_dict(orient="index"),
        has_treatment_data,
        no_treatment_data,
    )
def get_averages_for_tests_images(df):
    model_path = "../word2vec_twitter_model/word2vec_twitter_model.bin"
    print("Loading the model, this can take some time...")
    model = Word2Vec.load_word2vec_format(model_path, binary=True, norm_only=True)

    # Copy the slices so assigning the new column does not warn about
    # modifying a view of df.
    has = df[df.images == 1].copy()
    no = df[df.images == 0].copy()
    has["text_vector"] = None
    no["text_vector"] = None
    has = has.reset_index(drop=True)
    no = no.reset_index(drop=True)

    # Average the word vectors of each tweet with an image; guard against
    # tweets with no in-vocabulary tokens to avoid dividing by zero.
    for index, row in has.iterrows():
        tokens = preprocess_tweet(row["text"])
        vector = np.zeros(400)
        d = 0
        for token in tokens:
            if token in model.vocab:
                vector += model.syn0norm[model.vocab[token].index]
                d += 1
        if d != 0:
            vector = vector / d
        has.at[index, "text_vector"] = vector

    # Same averaging for the tweets without an image.
    for index, row in no.iterrows():
        tokens = preprocess_tweet(row["text"])
        vector = np.zeros(400)
        d = 0
        for token in tokens:
            if token in model.vocab:
                vector += model.syn0norm[model.vocab[token].index]
                d += 1
        if d != 0:
            vector = vector / d
        no.at[index, "text_vector"] = vector

    return has[["text_vector", "popularity"]], no[["text_vector", "popularity"]]
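# A minimal usage sketch, assuming a DataFrame with the columns the functions above
# expect ("text", "popularity", "images"); the file name tweets.csv is a placeholder,
# not part of the original code.
if __name__ == "__main__":
    tweets = pd.read_csv("tweets.csv")
    has_images, no_images = get_averages_for_tests_images(tweets)
    print(has_images.head())
    print(no_images.head())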
from word2vec_platform.word2vecReader import Word2Vec
# from gensim.models import Word2Vec

text = "donald"
model_path = "word2vec_twitter_model/word2vec_twitter_model.bin"
print("Loading the model, this can take some time...")
model = Word2Vec.load_word2vec_format(model_path, binary=True, norm_only=True)
print(len(model.syn0norm[model.vocab[text].index]))
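# Because load_word2vec_format is called with norm_only=True, syn0norm holds
# unit-length vectors, so the cosine similarity between two words is just a dot
# product. The sketch below only illustrates querying the loaded model; the word
# pair is arbitrary and not part of the original script.
def cosine_similarity(word_a, word_b):
    """Dot product of the normalised vectors of two in-vocabulary words."""
    vec_a = model.syn0norm[model.vocab[word_a].index]
    vec_b = model.syn0norm[model.vocab[word_b].index]
    return float(vec_a.dot(vec_b))


if "donald" in model.vocab and "trump" in model.vocab:
    print(cosine_similarity("donald", "trump"))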