if __name__ == "__main__":

    import sys
    # Project root; the commented alternatives are the Colab / JupyterHub roots,
    # kept so the script can be re-pointed quickly between environments.
    #root_project = "/content/SaRaH/"
    root_project = "/Users/Alessandro/Dev/repos/SaRaH/"
    #root_project = "/home/jupyter/SaRaH/"
    sys.path.append(root_project)

    from src.data.utils import load_csv_to_dict

    dataset_path = root_project + 'dataset/haspeede2/preprocessed2/dev/dev.csv'
    w2v_bin_path = root_project + 'results/model/word2vec/twitter128.bin'

    dataset = load_csv_to_dict(dataset_path)

    # Corpus vocabulary size; a set comprehension avoids materializing
    # the intermediate list that set([...]) would build first.
    unique_tokens = {word for words in dataset["tokens"] for word in words}
    print("Unique items in corpora are:", len(unique_tokens))

    # NOTE(review): build_w2v is not imported in this snippet — presumably it is
    # defined/imported elsewhere in the original module; verify before running
    # this block standalone.
    model = build_w2v(dataset["tokens"],
                      size=128,
                      window=5,
                      min_count=1,
                      sample=1e-4,
                      negative=5,
                      hs=0,
                      workers=4,
                      fine_tuning_on=w2v_bin_path,
                      seed=1)
# ---- Example #2 (second snippet; separator residue from extraction) ----
#PATH
# Input/model locations, all rooted at root_project (defined earlier in the file).
dataset_dev_path           = root_project + "dataset/haspeede2/preprocessed/dev/dev.csv"
dataset_test_tweets_path   = root_project + "dataset/haspeede2/preprocessed/reference/reference_tweets.csv"
w2v_bin_path               = root_project + 'results/model/word2vec/twitter128.bin'

#load word2vec and embedding_matrix
# Pretrained Italian Twitter word2vec (128-d, binary format) loaded via gensim;
# the helpers then derive the index<->token maps and a Keras-ready weight matrix.
w2v = KeyedVectors.load_word2vec_format(datapath(w2v_bin_path), binary=True)
index_to_key, key_to_index = get_index_key_association(w2v)
embedding_matrix, vocab_size = build_keras_embedding_matrix(w2v, index_to_key)

# Embedding dimensionality matches the 128-d vectors in twitter128.bin.
WORD_EMB_SIZE = 128
VOCAB_SIZE = vocab_size


#load dataset dictionary
# Column-oriented dicts (e.g. "tokens", "stereotype") produced by load_csv_to_dict.
dataset_dev = load_csv_to_dict(dataset_dev_path)
dataset_test_tweets = load_csv_to_dict(dataset_test_tweets_path)


def load_data(dataset_dict, w2v, key_to_index, embedding_matrix, max_text_len):
    """Convert a dataset dictionary into model-ready (X, y) arrays.

    Args:
        dataset_dict: dict with at least "tokens" (list of token lists) and
            "stereotype" (list of labels).
        w2v: word2vec model used to detect out-of-vocabulary tokens.
        key_to_index: mapping from token to integer index.
        embedding_matrix: not used by this function; kept for interface
            compatibility with callers.
        max_text_len: fixed sequence length for post-padding/truncation.

    Returns:
        X: np.ndarray of shape (n_samples, max_text_len) of int token ids.
        y: np.ndarray of the "stereotype" labels.
    """
    # TODO: also return the remaining fields (extra, lemma, stem, ...).
    X = dataset_dict["tokens"]
    X = set_unkmark_token(X, w2v)      # replace OOV tokens with the UNK marker
    X = get_int_seq(X, key_to_index)   # tokens -> integer id sequences
    X = pad_sequences(X, maxlen=max_text_len, padding='post', truncating='post')
    X = np.array(X)
    y = np.array(dataset_dict["stereotype"])
    return X, y

def to_emb(X):