def get_averages_for_tests(df, treatment):
    """Build treated/untreated covariate tables from tweet text embeddings.

    Each tweet's text is embedded as the mean of its in-vocabulary word2vec
    vectors (400-dim), the embedding matrix is reduced to 85 components with
    PCA, and rows are split on the binary ``treatment`` column.

    Args:
        df: DataFrame with at least ``text``, ``popularity``, ``text_length``,
            ``time_of_day`` and the ``treatment`` column. NOTE: a
            ``text_vector`` marker column is written into the caller's frame
            as a side effect (preserved from the original implementation).
        treatment: name of the binary (0/1) treatment column.

    Returns:
        Tuple ``(treated_popularity, untreated_popularity, treated_df,
        untreated_df)`` where the popularity values are dicts keyed by a
        dense 0..k index and the frames hold length/time/PCA covariates.
    """
    model_path = "word2vec_twitter_model/word2vec_twitter_model.bin"
    print("Loading the model, this can take some time...")
    model = Word2Vec.load_word2vec_format(model_path,
                                          binary=True,
                                          norm_only=True)
    df["text_vector"] = None
    matrix = []
    has_treatment = {}
    no_treatment = {}

    pca = PCA(n_components=85, random_state=1)
    for index, row in df.iterrows():
        tokens = preprocess_tweet(row["text"])
        vector = np.zeros(400)
        d = 0
        for token in tokens:
            if token in model.vocab:
                vector += model.syn0norm[model.vocab[token].index]
                d += 1
        if d != 0:
            matrix.append(vector / d)
            put = "OK"
        else:
            # No in-vocabulary token: mark the row so it is dropped below.
            put = "E"
        # .at replaces DataFrame.set_value, which was deprecated in pandas
        # 0.21 and removed in pandas 1.0.
        df.at[index, "text_vector"] = put
    # Keep only rows that produced an embedding; reindex so positions line
    # up with the rows of `matrix` / `matrix_reduced`.
    df = df[df.text_vector != "E"]
    df = df.reset_index(drop=True)
    matrix_reduced = pca.fit_transform(matrix)
    index_for_treated = 0
    index_for_non_treated = 0
    has_treatment_popularity = {}
    no_treatment_popularity = {}
    for index, row in df.iterrows():
        if row[treatment] == 1:
            has_treatment_popularity[index_for_treated] = row["popularity"]
            has_treatment[index_for_treated] = {}
            length = row["text_length"]
            time = row["time_of_day"]
            vector = matrix_reduced[index]
            has_treatment[index_for_treated]["length"] = length
            has_treatment[index_for_treated]["time_of_day"] = time
            for i, data_point in enumerate(vector):
                has_treatment[index_for_treated][i] = data_point
            index_for_treated += 1
        else:
            # NOTE(review): the untreated table intentionally(?) omits
            # "time_of_day" — kept as-is to preserve behavior; confirm.
            no_treatment_popularity[index_for_non_treated] = row["popularity"]
            no_treatment[index_for_non_treated] = {}
            length = row["text_length"]
            vector = matrix_reduced[index]
            no_treatment[index_for_non_treated]["length"] = length
            for i, data_point in enumerate(vector):
                no_treatment[index_for_non_treated][i] = data_point
            index_for_non_treated += 1
    return has_treatment_popularity, no_treatment_popularity, pd.DataFrame.from_dict(
        has_treatment, orient="index"), pd.DataFrame.from_dict(no_treatment,
                                                               orient="index")
# Example n. 2
def get_averages_for_tests_images(df, keys):
    """Split tweets on the ``images`` flag and build per-user data sets.

    Tweets are partitioned into with-image / without-image groups, each
    group is aggregated per ``user.id`` via ``convert``, sentence vectors
    are attached with ``create_vectors_from_sentence``, and joined data
    sets are produced with ``create_joined_data_sets``.

    Returns a tuple: (popularity dict for the image group, popularity dict
    for the no-image group, joined treated data, joined untreated data).
    """
    model_path = "word2vec_twitter_model/word2vec_twitter_model.bin"
    print("Loading the model, this can take some time...")
    model = Word2Vec.load_word2vec_format(model_path,
                                          binary=True,
                                          norm_only=True)

    with_images = df[df.images == 1].reset_index(drop=True)
    without_images = df[df.images == 0].reset_index(drop=True)

    # Per-user aggregation of popularity and text.
    grouped_without = without_images.groupby(
        ["user.id"])[["popularity", "text"]].apply(convert)
    grouped_with = with_images.groupby(
        ["user.id"])[["popularity", "text"]].apply(convert)

    grouped_without = create_vectors_from_sentence(grouped_without, model)
    grouped_with = create_vectors_from_sentence(grouped_with, model)

    treated_data = create_joined_data_sets(grouped_with, with_images, keys)
    untreated_data = create_joined_data_sets(grouped_without, without_images,
                                             keys)

    pop_with = grouped_with[["popularity"]].to_dict(orient="index")
    pop_without = grouped_without[["popularity"]].to_dict(orient="index")
    return pop_with, pop_without, treated_data, untreated_data
# Example n. 3
def get_averages_for_tests_images(df):
    """Attach mean word2vec text embeddings to tweets with/without images.

    For each tweet, the embedding is the average of the word2vec vectors of
    its in-vocabulary tokens (400-dim).

    Args:
        df: DataFrame with at least ``text``, ``popularity`` and a binary
            ``images`` column.

    Returns:
        Tuple ``(has, no)`` of DataFrames restricted to the
        ``["text_vector", "popularity"]`` columns, for tweets with and
        without images respectively.
    """
    model_path = "../word2vec_twitter_model/word2vec_twitter_model.bin"
    print("Loading the model, this can take some time...")
    model = Word2Vec.load_word2vec_format(model_path,
                                          binary=True,
                                          norm_only=True)
    # .copy() so adding the new column below writes to an independent frame
    # instead of a view of `df` (avoids SettingWithCopy issues).
    has = df[df.images == 1].copy()
    no = df[df.images == 0].copy()
    has["text_vector"] = None
    no["text_vector"] = None
    has = has.reset_index(drop=True)
    no = no.reset_index(drop=True)
    # The original duplicated this loop verbatim for each frame.
    for frame in (has, no):
        for index, row in frame.iterrows():
            tokens = preprocess_tweet(row["text"])
            vector = np.zeros(400)
            d = 0
            for token in tokens:
                if token in model.vocab:
                    vector += model.syn0norm[model.vocab[token].index]
                    d += 1
            # Guard the division: the original did vector / 0 for tweets
            # with no in-vocabulary tokens, yielding an all-NaN vector.
            # Such tweets now keep the zero vector instead.
            if d != 0:
                vector = vector / d
            # .at replaces DataFrame.set_value, which was deprecated in
            # pandas 0.21 and removed in pandas 1.0.
            frame.at[index, "text_vector"] = vector
    return has[["text_vector",
                "popularity"]], no[["text_vector", "popularity"]]
# Example n. 4
# Smoke-test script: load the pretrained Twitter word2vec model from disk
# and print the dimensionality of the vector for one known word.
from word2vec_platform.word2vecReader import Word2Vec
# from  gensim.models import Word2Vec
text = "donald"

model_path = "word2vec_twitter_model/word2vec_twitter_model.bin"
print("Loading the model, this can take some time...")
model = Word2Vec.load_word2vec_format(model_path, binary=True,norm_only=True)
# Look up the word's row in the normalised embedding matrix and print its
# length — presumably 400, matching the np.zeros(400) vectors used elsewhere
# in this project; confirm against the model file.
print(len(model.syn0norm[model.vocab[text].index]))