Example #1
def load_embeddings(model_conf):
    word_vectors = os.path.join(BASE_PATH, "embeddings",
                                "{}.txt".format(model_conf["embeddings_file"]))
    word_vectors_size = model_conf["embed_dim"]

    # load word embeddings
    print("loading word embeddings...")
    return load_word_vectors(word_vectors, word_vectors_size)
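Note: every example in this listing calls load_word_vectors(path, dim) without showing its definition. Purely as orientation, a minimal loader consistent with the way it is used here might look like the sketch below; the one-token-plus-dim-floats file format and the malformed-line handling are assumptions, not the repository's actual implementation.

import numpy

def load_word_vectors_sketch(path, dim):
    """Hypothetical sketch of a word-vector loader: builds word2idx, idx2word
    and an embeddings matrix from a plain-text file with one token followed
    by `dim` float components per line."""
    word2idx, idx2word, vectors = {}, {}, []
    with open(path, encoding="utf-8") as f:
        for line in f:
            parts = line.rstrip().split(" ")
            if len(parts) != dim + 1:
                continue  # skip header or malformed lines
            idx2word[len(vectors)] = parts[0]
            word2idx[parts[0]] = len(vectors)
            vectors.append([float(x) for x in parts[1:]])
    return word2idx, idx2word, numpy.array(vectors, dtype="float32")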
Example #2
def load_embeddings(model_conf,
                    absolute_path=False,
                    embedding_size_auto_detect=None):
    if not absolute_path:
        word_vectors = os.path.join(
            BASE_PATH, "embeddings",
            "{}.txt".format(model_conf["embeddings_file"]))
    else:
        # absolute path: use the configured file path as-is
        word_vectors = model_conf["embeddings_file"]

    if embedding_size_auto_detect is not None:
        word_vectors_size = detect_embedding_dim(word_vectors)
    else:
        word_vectors_size = model_conf["embed_dim"]

    # load word embeddings
    print("loading word embeddings...")
    return load_word_vectors(word_vectors, word_vectors_size)
Example #3
    def read_embeddings(self, file, dim):
        """
        Create an Embeddings Matrix, in which each row corresponds to
        the word vector from the pretrained word embeddings.
        If a word is missing from the provided pretrained word vectors, then
        sample a new embedding, from the gaussian of the pretrained embeddings.

        Args:
            file: path to the pretrained word-vectors file.
            dim: dimensionality of the word vectors.

        Returns:
            The filtered embeddings matrix, a 0/1 mask marking the rows that
            were sampled (missing from the pretrained vectors), and the list
            of missing token ids.
        """
        word2idx, idx2word, embeddings = load_word_vectors(file, dim)

        mu = embeddings.mean(axis=0)
        sigma = embeddings.std(axis=0)

        filtered_embeddings = numpy.zeros((len(self), embeddings.shape[1]))

        mask = numpy.zeros(len(self))
        missing = []

        for token_id, token in tqdm(self.id2tok.items(),
                                    desc="Reading embeddings...",
                                    total=len(self.id2tok.items())):
            if token not in word2idx or token == "<unk>":
                # todo: smart sampling per dim distribution
                # sample = numpy.random.uniform(low=-0.5, high=0.5,
                #                               size=embeddings.shape[1])
                sample = numpy.random.normal(mu, sigma / 4)
                filtered_embeddings[token_id] = sample

                mask[token_id] = 1
                missing.append(token_id)
            else:
                filtered_embeddings[token_id] = embeddings[word2idx[token]]

        print(f"Missing tokens from the pretrained embeddings: {len(missing)}")

        return filtered_embeddings, mask, missing
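A natural follow-up, not shown in the example, is to feed the returned matrix into a PyTorch embedding layer. The vocab instance and the file path below are placeholders for illustration only.

import torch
import torch.nn as nn

# hypothetical usage of read_embeddings() from Example #3
emb_matrix, mask, missing = vocab.read_embeddings("embeddings/word2vec_300.txt", 300)
embedding_layer = nn.Embedding.from_pretrained(
    torch.tensor(emb_matrix, dtype=torch.float),
    freeze=False)  # keep the (partly re-sampled) rows trainable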
Example #4
EMB_TRAINABLE = False
BATCH_SIZE = 128
EPOCHS = 40
DATASET = "MR"  # options: "MR", "Semeval2017A"

# use a CUDA-compatible GPU if one is available, otherwise fall back to the CPU
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

########################################################
# Define PyTorch datasets and dataloaders
########################################################

# load word embeddings
print("loading word embeddings...")
word2idx, idx2word, embeddings = load_word_vectors(EMBEDDINGS, EMB_DIM)

# load the raw data
if DATASET == "Semeval2017A":
    X_train, y_train, X_test, y_test = load_Semeval2017A()
elif DATASET == "MR":
    X_train, y_train, X_test, y_test = load_MR()
else:
    raise ValueError("Invalid dataset")

# ------------ #
#     EX1      #
# ------------ #
# Convert data labels from strings to integers
# create a new label encoder
le = LabelEncoder()
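The snippet stops right after creating the encoder; with scikit-learn's standard LabelEncoder API, an assumed continuation would be:

# assumed continuation: map string labels to integer ids
y_train = le.fit_transform(y_train)
y_test = le.transform(y_test)
n_classes = len(le.classes_)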
Example #5
sentiment_yamls = os.path.join(YAML_PATH, 'sentiment')
sst_fine_grained_yamls = os.path.join(YAML_PATH, 'sst_fine_grained')
scv1_yamls = os.path.join(YAML_PATH, 'scv1')
scv2_yamls = os.path.join(YAML_PATH, 'scv2_gen')
psych_yamls = os.path.join(YAML_PATH, 'psychexp')

one_exp = False

yaml = "gating.yaml"
yamls_path = irony_yamls

#########################
# Load embeddings
#########################
if yamls_path in (irony_yamls, sentiment_yamls):
    word2idx, idx2word, weights = load_word_vectors(
        os.path.join(EMB_DIR, "word2vec_300_6_20_neg.txt"), 300)
else:
    word2idx, idx2word, weights = load_word_vectors_from_fasttext(
        os.path.join(EMB_DIR, "wiki.en.vec"), 300)

#########################
# Run experiments
#########################
if one_exp:
    loss, acc, f1, precision, recall, f1_test, acc_test = clf_features_runner(
        os.path.join(sentiment_yamls, yaml),
        word2idx, idx2word, weights, cluster=True)

    experiments = {'loss': loss, 'acc': acc, 'f1': f1, 'precision': precision,
                   'recall': recall, 'f1_test': f1_test, 'acc_test': acc_test}

    now = datetime.datetime.now().strftime("%y-%m-%d_%H:%M:%S")
Example #6
psych_yamls = os.path.join(YAML_PATH, 'psychexp')
yaml = os.path.join(psych_yamls, "baseline.yaml")

opts, config = train_options(yaml)
device = opts.device
X_train, y_train, X_test, y_test = load_dataset(config["data"]["dataset"],
                                                test=True)

# load word embeddings
if config["data"]["embeddings"] == "wiki.en.vec":
    word2idx, idx2word, weights = load_word_vectors_from_fasttext(
        os.path.join(EMB_DIR, config["data"]["embeddings"]),
        config["data"]["embeddings_dim"])
else:
    word2idx, idx2word, weights = load_word_vectors(
        os.path.join(EMB_DIR, config["data"]["embeddings"]),
        config["data"]["embeddings_dim"])

checkpoint_name = "Psych_exp_baseline"

state = load_checkpoint(checkpoint_name)

# features, feat_length = load_features(config["data"]["features"])

test_set = ClfDataset(X_test, y_test, word2idx, name="psych_test")
test_lengths = [len(x) for x in test_set.data]
test_sampler = SortedSampler(test_lengths)
test_loader = DataLoader(test_set,
                         sampler=test_sampler,
                         batch_size=config["batch_size"],
                         num_workers=opts.cores)
Example #7
#############################################################
# Bag-of-Words
#############################################################
bow_clf = bow_model("clf", max_features=30000)
bow_clf.fit(X_train, y_train)
y_pred = bow_clf.predict(X_test)
bow_results = eval_clf(y_pred, y_test)

print("\n" + "#" * 40)
print("Bag-of-Words")
print("#" * 40)
for k, v in bow_results.items():
    print("{}:{:.4f}".format(k, v))

#############################################################
# Neural Bag-of-Words
#############################################################

file = os.path.join(BASE_PATH, "embeddings", "word2vec_300_6_20_neg.txt")
word2idx, idx2word, weights = load_word_vectors(file, 300)

nbow_clf = nbow_model("clf", weights, word2idx)
nbow_clf.fit(X_train, y_train)
y_pred = nbow_clf.predict(X_test)
nbow_results = eval_clf(y_pred, y_test)

print("\n" + "#" * 40)
print("Neural Bag-of-Words")
print("#" * 40)
for k, v in nbow_results.items():
    print("{}:{:.4f}".format(k, v))
Example #8
from utils.load_embeddings import load_word_vectors

load_word_vectors("GoogleNews-vectors-negative300.txt", 300)
Example #9
def submission(dataset, models=[], lm=[], gold=[]):

    X = load_test_wassa(dataset)

    with open("label_encoder.pkl", "rb") as f:
        label_encoder = pickle.load(f)

    # load embeddings
    file = os.path.join(BASE_PATH, "embeddings", "ntua_twitter_300.txt")
    word2idx, idx2word, weights = load_word_vectors(file, 300)

    dummy_y = [[0] * 6] * len(X)
    dummy_y = torch.tensor(dummy_y)

    posteriors_list = []
    predicted_list = []

    for i in range(0, len(models)):

        checkpoint_name = models[i]

        if lm[i]:
            model, optimizer, word2idx, idx2word, loss, acc, f1 = \
                load_checkpoint_pre_lm(checkpoint_name)
        else:
            model, optimizer, vocab, loss, acc, f1 = \
                load_checkpoint_with_f1(checkpoint_name)

        #####################################################################
        # Define Dataloaders
        #####################################################################
        preprocessor = twitter_preprocessor()

        # for new experiments remember to empty _cache!
        test_set = WordDataset(X,
                               dummy_y,
                               word2idx,
                               name="wassa_test_submit",
                               preprocess=preprocessor)
        sampler = SequentialSampler(test_set)

        test_loader = DataLoader(test_set, batch_size=32, sampler=sampler)

        #####################################################################
        # Load Trained Model
        #####################################################################
        model.eval()
        model.to(config.DEVICE)
        print(model)

        #####################################################################
        # Evaluate Trained Model on test set & Calculate predictions
        #####################################################################
        labels, predicted, posteriors = test_clf(model=model,
                                                 data_source=test_loader,
                                                 device=config.DEVICE)
        # pprint(labels)
        pprint(predicted)

        predicted_list.append(predicted)
        posteriors_list.append(posteriors)

    # pred, accuracy, f1  = ensemble_voting(predicted_list, gold, dataset)
    pred, accuracy, f1 = ensemble_posteriors(posteriors_list, gold, dataset)

    #####################################################################
    # Create submission file with the predictions
    #####################################################################
    write_predictions(pred, label_encoder)
    return
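ensemble_posteriors is not shown in the snippet; one common way to combine such outputs is to average the per-model class posteriors and take the argmax. The real function also scores the result against gold, which the sketch below omits:

import numpy

def average_posteriors(posteriors_list):
    """Hypothetical sketch: average (n_samples, n_classes) posteriors over
    models and return the per-sample argmax as the ensemble prediction."""
    avg = numpy.mean(numpy.stack(posteriors_list, axis=0), axis=0)
    return avg.argmax(axis=1)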
Example #10
    bioc.dump(collection, open(output, 'w'), pretty_print=True)


if __name__ == "__main__":
    # LOAD RAW DATA & WORD VECTORS
    EVAL_DATASET = '../../dataset/PMtask_TestSet.xml'
    MODE = "eval"

    WV_PATH = '../../embeddings/PubMed-w2v.txt'
    WV_DIMS = 200
    MAX_SENT_LENGTH = 45
    MAX_SENTS = 23

    print("loading word embeddings...")
    word2idx, idx2word, embeddings = load_word_vectors(WV_PATH, WV_DIMS, True)

    docs, labels, ids = load_data(EVAL_DATASET, MODE)

    # convert strings to lists of tokens
    print("Tokenizing...")
    docs = [[text_to_word_sequence(sent) for sent in sent_tokenize(doc)]
            for doc in docs]

    # convert words to word indexes
    print("Vectorizing...")
    docs = [vectorize_doc(doc, word2idx, MAX_SENTS, MAX_SENT_LENGTH)
            for doc in docs]
    docs = numpy.array(docs)

    # LOAD SAVED MODEL
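vectorize_doc is not defined in the snippet. A minimal sketch of such a helper, assuming zero-padding to a fixed (MAX_SENTS, MAX_SENT_LENGTH) grid and an optional <unk> entry in word2idx, might be:

import numpy

def vectorize_doc_sketch(doc, word2idx, max_sents, max_sent_length, unk="<unk>"):
    """Hypothetical sketch: map a list of tokenized sentences to a fixed-size
    matrix of word indexes, truncating and zero-padding as needed."""
    out = numpy.zeros((max_sents, max_sent_length), dtype="int64")
    for i, sent in enumerate(doc[:max_sents]):
        for j, token in enumerate(sent[:max_sent_length]):
            out[i, j] = word2idx.get(token, word2idx.get(unk, 0))
    return out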
Example #11
from utils.load_embeddings import load_word_vectors

load_word_vectors(
    r"D:\ruin\data\GoogleNews-vectors-negative300.bin\GoogleNews-vectors-negative300.txt",
    300)
Example #12
from model.task1.baseline_models import train_ei_reg, train_ei_oc, train_v_reg, \
    train_v_oc, train_e_c
from modules.sklearn.models import nbow_model, bow_model, eval_reg, eval_mclf
from utils.load_embeddings import load_word_vectors
from utils.nlp import twitter_preprocess

emb_files = [
    ("word2vec_300_6_20_neg.txt", 300),
    ("word2vec_300_6_concatened.txt", 310),
    ("word2vec_500_6_20_neg.txt", 500),
    ("word2vec_500_6_concatened.txt", 510),
]
embeddings = {}
for e, d in emb_files:
    file = os.path.join(BASE_PATH, "embeddings", e)
    word2idx, idx2word, weights = load_word_vectors(file, d)
    embeddings[e.split(".")[0]] = (weights, word2idx)

bow_clf = bow_model("clf")
bow_reg = bow_model("reg")
nbow_clf = {"nbow_{}".format(name): nbow_model("clf", e, w2i)
            for name, (e, w2i) in embeddings.items()}
nbow_reg = {"nbow_{}".format(name): nbow_model("reg", e, w2i)
            for name, (e, w2i) in embeddings.items()}

preprocessor = twitter_preprocess()

# ###########################################################################
# # 1. Task EI-reg: Detecting Emotion Intensity (regression)
# ###########################################################################
extral = False

discr = False
d = 0.6

unfreeze = True
freeze = {"embed": True, "hidden": True}

# at which epoch the fine-tuning of each component starts (gradual unfreezing)
unfreeze_epoque = {"embed": 6, "hidden": 4}

name = "wassa_2M_ep2_GU_lr_weight_decay"

file = os.path.join(BASE_PATH, "embeddings", "ntua_twitter_300.txt")
_, _, weights = load_word_vectors(file, 300)

# load dataset
config = WASSA_WITH_PRETR_LM
config_lm = ConfLangModel

# Attention size needs to be equal to RNN size for Transfer Learning
if config['encoder_size'] != config_lm['rnn_size']:
    config['encoder_size'] = config_lm['rnn_size']
    print("Classifier RNN size needs to be equal to LM RNN size!")

X_train, X_test, y_train, y_test = load_wassa()

# 3 - convert labels from strings to integers
label_encoder = LabelEncoder()
label_encoder = label_encoder.fit(y_train)
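The freeze / unfreeze_epoque dictionaries above suggest a gradual-unfreezing schedule during fine-tuning. A sketch of how they could drive such a schedule (the .embedding and .encoder attributes are assumptions about the model, not the repository's actual training loop):

def apply_unfreezing(model, epoch, freeze, unfreeze_epoque):
    """Hypothetical sketch: release parameter groups once their scheduled epoch is reached."""
    if freeze["embed"] and epoch >= unfreeze_epoque["embed"]:
        for p in model.embedding.parameters():  # assumes an .embedding module
            p.requires_grad = True
        freeze["embed"] = False
    if freeze["hidden"] and epoch >= unfreeze_epoque["hidden"]:
        for p in model.encoder.parameters():    # assumes an .encoder module
            p.requires_grad = True
        freeze["hidden"] = False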