if __name__ == "__main__":
    import sys

    #root_project = "/content/SaRaH/"
    root_project = "/Users/Alessandro/Dev/repos/SaRaH/"
    #root_project = "/home/jupyter/SaRaH/"
    sys.path.append(root_project)

    from src.data.utils import load_csv_to_dict

    dataset_path = root_project + 'dataset/haspeede2/preprocessed2/dev/dev.csv'
    w2v_bin_path = root_project + 'results/model/word2vec/twitter128.bin'

    dataset = load_csv_to_dict(dataset_path)

    vocabulary = {word for words in dataset["tokens"] for word in words}
    print("Unique tokens in the corpus:", len(vocabulary))

    model = build_w2v(dataset["tokens"], size=128, window=5, min_count=1,
                      sample=1e-4, negative=5, hs=0, workers=4,
                      fine_tuning_on=w2v_bin_path, seed=1)
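# A minimal sketch (an assumption, not the project's actual build_w2v) of how the
# call above can be implemented with gensim 3.x, which the size= keyword suggests:
# build the vocabulary, seed it with the pretrained binary vectors, then continue
# training. The name _build_w2v_sketch and the lockf/epochs choices are illustrative.
def _build_w2v_sketch(sentences, size, window, min_count, sample, negative, hs,
                      workers, fine_tuning_on, seed):
    from gensim.models import Word2Vec

    model = Word2Vec(size=size, window=window, min_count=min_count, sample=sample,
                     negative=negative, hs=hs, workers=workers, seed=seed)
    model.build_vocab(sentences)
    # Copy pretrained vectors for the overlapping vocabulary; lockf=1.0 lets them
    # keep updating during the fine-tuning pass.
    model.intersect_word2vec_format(fine_tuning_on, binary=True, lockf=1.0)
    model.train(sentences, total_examples=model.corpus_count, epochs=model.epochs)
    return model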
# third-party imports needed below (pad_sequences may come from
# tensorflow.keras.preprocessing.sequence, depending on the Keras setup);
# get_index_key_association, build_keras_embedding_matrix, set_unkmark_token and
# get_int_seq are project helpers whose import path is not shown in this snippet.
import numpy as np
from gensim.models import KeyedVectors
from gensim.test.utils import datapath
from keras.preprocessing.sequence import pad_sequences

# paths
dataset_dev_path = root_project + "dataset/haspeede2/preprocessed/dev/dev.csv"
dataset_test_tweets_path = root_project + "dataset/haspeede2/preprocessed/reference/reference_tweets.csv"
w2v_bin_path = root_project + 'results/model/word2vec/twitter128.bin'

# load word2vec and the embedding matrix
w2v = KeyedVectors.load_word2vec_format(datapath(w2v_bin_path), binary=True)
index_to_key, key_to_index = get_index_key_association(w2v)
embedding_matrix, vocab_size = build_keras_embedding_matrix(w2v, index_to_key)

WORD_EMB_SIZE = 128
VOCAB_SIZE = vocab_size

# load dataset dictionaries
dataset_dev = load_csv_to_dict(dataset_dev_path)
dataset_test_tweets = load_csv_to_dict(dataset_test_tweets_path)


def load_data(dataset_dict, w2v, key_to_index, embedding_matrix, max_text_len):
    # TODO: must also return everything else: extra, lemma, stem, ...
    sentences = dataset_dict["tokens"]
    # Mark out-of-vocabulary tokens, map each token to its integer index,
    # and pad/truncate every sequence to max_text_len.
    X = set_unkmark_token(sentences, w2v)
    X = get_int_seq(X, key_to_index)
    X = pad_sequences(X, maxlen=max_text_len, padding='post', truncating='post')
    X = np.array(X)
    y = np.array(dataset_dict["stereotype"])
    return X, y


def to_emb(X):