def createEmbedMatrix(embeddingFileName="../../../glove.6B.50d.txt"):
    vocab = utils.glove2dict(embeddingFileName)
    emb_matrix = np.zeros((len(vocab) + 2, vocab["the"].shape[0]))
    word2Index = {}
    index2Word = {}
    index = 0
    for word in vocab:
        emb_matrix[index] = vocab[word]
        word2Index[word] = index
        index2Word[index] = word
        index += 1
    # Add UNK
    word2Index["<UNK>"] = len(word2Index)
    index2Word[word2Index["<UNK>"]] = "<UNK>"
    unkValue = []
    # Pick each component of the UNK vector from a randomly chosen vocabulary word.
    for i in range(0, vocab["the"].size):
        randomKey = random.choice(list(vocab.keys()))
        unkValue.append(vocab[randomKey][i])
    emb_matrix[word2Index["<UNK>"]] = np.asarray(unkValue)
    # Add PAD
    word2Index["<PAD>"] = len(word2Index)
    index2Word[word2Index["<PAD>"]] = "<PAD>"
    emb_matrix[word2Index["<PAD>"]] = np.zeros_like(vocab["the"])
    return emb_matrix, word2Index, index2Word
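# A minimal usage sketch (an assumption, not from the original code): map a
# tokenized sentence to embedding rows, falling back to <UNK> for
# out-of-vocabulary tokens and padding to a fixed length with <PAD>. It assumes
# the default GloVe path passed to `createEmbedMatrix` exists.
emb_matrix, word2Index, index2Word = createEmbedMatrix()

def sentence_to_ids(tokens, word2Index, max_len=20):
    ids = [word2Index.get(t.lower(), word2Index["<UNK>"]) for t in tokens[:max_len]]
    ids += [word2Index["<PAD>"]] * (max_len - len(ids))
    return ids

ids = sentence_to_ids("The quick brown fox".split(), word2Index)
sentence_vectors = emb_matrix[ids]  # (max_len, embed_dim)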
def analysis3(
        fileName,
        gloveFileName="../../../../../../GitHub/cs224u/vsmdata/glove/glove.6B.50d.txt"):
    included = 0
    count = []
    vocab = utils.glove2dict(gloveFileName)  # dict[word] -> numpy array(embed_dim,)
    with open(fileName) as f:
        lineNum = 0
        for line in f:
            if lineNum % 4 != 0:
                lineNum += 1
                continue
            # Check the raw find() results before offsetting, otherwise the
            # "not found" value -1 can never be detected.
            firstStartIdx = line.find('<e1>')
            firstEndIdx = line.find('</e1>')
            secondStartIdx = line.find('<e2>')
            secondEndIdx = line.find('</e2>')
            if -1 in (firstStartIdx, firstEndIdx, secondStartIdx, secondEndIdx):
                print("ERROR")
            firstEntity = line[firstStartIdx + 4:firstEndIdx]
            secondEntity = line[secondStartIdx + 4:secondEndIdx]
            #print(firstEntity, secondEntity)
            if firstEntity not in vocab:
                count.append(firstEntity)
            else:
                included += 1
            if secondEntity not in vocab:
                count.append(secondEntity)
            else:
                included += 1
            lineNum += 1
    print(included)
    print(len(count))
def derive_pca_on_glove(paragraph, num_dimensions_to_accumulate):
    """Helper function to get the PCA decomposition of the GloVe vectors in `paragraph`."""
    global glove
    if glove is None:
        glove = utils.glove2dict(
            os.path.join(path_to_glove, 'glove.6B.%dd.txt' % GLOVE_SIZE))
    approx_num_words = int(1.5 * len(paragraph.split()))
    word_vector = np.zeros((approx_num_words, GLOVE_SIZE))
    row = 0
    words = []
    for sent in sent_tokenize(paragraph):
        for word in word_tokenize(sent):
            word = word.lower()
            if word in glove and word not in words:
                word_vector[row] = glove[word]
                words.append(word)
                row += 1
    word_vector = word_vector[:row, :]
    pca = PCA(n_components=num_dimensions_to_accumulate)
    pca.fit(word_vector)
    return pca
def dimensional_decomposition(paragraph, unused_parse, num_dimensions_to_accumulate=5):
    """Gets the extent to which the word embeddings used in a paragraph can be
    reduced to a low-dimensional space. The low-dimensional space is derived by PCA.

    Central insight: if the words all lie in a low-dimensional space, then they
    are likely expressing variations on the same theme rather than a nuanced
    argument. The confounding variable of paragraph length actually seems good
    here.

    Empirical results: this works, and it captures something about content
    rather than just matching on highly functional words and phrases. On the
    toy dataset, the first dimension has a correlation of -0.51 with the score
    -- this is extremely encouraging! The second through fifth cumulative
    dimensions are even stronger, all at about -0.56. Best to use a smaller
    number of dimensions (like 50), else we risk a highly overdetermined
    system of equations when heading into PCA.

    Returns:
        dictionary with keys cum_pca_var_expl_# (where # is in
        range [0, num_dimensions_to_accumulate)); each value gives the amount
        of variance explained when that number of dimensions is considered.
    """
    global glove
    if glove is None:
        glove = utils.glove2dict(
            os.path.join(path_to_glove, 'glove.6B.%dd.txt' % GLOVE_SIZE))
    approx_num_words = int(1.5 * len(paragraph.split()))
    word_vector = np.zeros((approx_num_words, GLOVE_SIZE))
    row = 0
    words = []
    for sent in sent_tokenize(paragraph):
        for word in word_tokenize(sent):
            word = word.lower()
            if word in glove and word not in words:
                word_vector[row] = glove[word]
                words.append(word)
                row += 1
    word_vector = word_vector[:row, :]
    pca = PCA()
    pca.fit(word_vector)
    features = Counter()
    for i in range(num_dimensions_to_accumulate):
        features["cum_pca_var_expl_%d" % i] = np.sum(
            pca.explained_variance_ratio_[:i + 1])
    return features
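# A self-contained sketch (toy data, not the original pipeline) of the
# cumulative explained-variance features described above: fit PCA on a small
# matrix standing in for the paragraph's word vectors and accumulate
# `explained_variance_ratio_` exactly as `dimensional_decomposition` does.
import numpy as np
from sklearn.decomposition import PCA

toy_vectors = np.random.randn(30, 50)  # 30 "words", 50-dimensional embeddings
pca_demo = PCA()
pca_demo.fit(toy_vectors)
for i in range(5):
    print("cum_pca_var_expl_%d = %.3f"
          % (i, np.sum(pca_demo.explained_variance_ratio_[:i + 1])))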
def load_data_embedding(glove6B=False):
    X_train, y_train = build_dataset('train', max_sen_length)
    X_dev, y_dev = build_dataset('dev', max_sen_length)
    embedding_weights = np.zeros((vocab_size, vocab_dim))
    if vocab_dim == 50:
        GLOVE = utils.glove2dict(os.path.join(glove_home, 'glove.6B.50d.txt'))
    elif vocab_dim == 100:
        GLOVE = utils.glove2dict(os.path.join(glove_home, 'glove.6B.100d.txt'))
    elif vocab_dim == 200:
        GLOVE = utils.glove2dict(os.path.join(glove_home, 'glove.6B.200d.txt'))
    elif vocab_dim == 300:
        if glove6B:
            GLOVE = utils.glove2dict(os.path.join(glove_home, 'glove.6B.300d.txt'))
        else:
            # Use a separate local name: assigning to `glove_home` inside this
            # function would make it local everywhere and raise
            # UnboundLocalError in the branches above.
            glove_840b_home = 'glove_dir/glove.840B'
            GLOVE = utils.glove2dict(os.path.join(glove_840b_home, 'glove.840B.300d.txt'))
    for word, index in wordMap.items():
        if word in GLOVE:
            embedding_weights[index, :] = GLOVE[word]
        else:
            embedding_weights[index, :] = utils.randvec(vocab_dim)
    return X_train, y_train, X_dev, y_dev, embedding_weights
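# A minimal sketch (an assumption, not from the original code) of how the
# returned `embedding_weights` matrix could seed a Keras embedding layer; the
# matrix shape is taken as (vocab_size, vocab_dim).
import tensorflow as tf

def build_embedding_layer(embedding_weights, trainable=True):
    vocab_size, vocab_dim = embedding_weights.shape
    return tf.keras.layers.Embedding(
        input_dim=vocab_size,
        output_dim=vocab_dim,
        embeddings_initializer=tf.keras.initializers.Constant(embedding_weights),
        trainable=trainable)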
def loadData():
    vocab = utils.glove2dict(
        "../../../glove.6B.50d.txt")  # dict[word] -> numpy array(embed_dim,)
    # embedding_matrix: (embed_dim, vocab_size)
    embedding_matrix = np.zeros((len(vocab["and"]), len(vocab)))
    word2Index = {}
    index2Word = {}
    counter = 0
    for word, vec in vocab.items():
        embedding_matrix[:, counter] = vec
        word2Index[word] = counter
        index2Word[counter] = word
        counter += 1
    return embedding_matrix, word2Index, index2Word
def loadData(
        gloveFileName="../../../../../../GitHub/cs224u/vsmdata/glove/glove.6B.50d.txt"):
    vocab = utils.glove2dict(gloveFileName)  # dict[word] -> numpy array(embed_dim,)
    # embedding_matrix: (embed_dim, vocab_size)
    embedding_matrix = np.zeros((len(vocab["and"]), len(vocab)))
    word2Index = {}
    index2Word = {}
    counter = 0
    for word, vec in vocab.items():
        embedding_matrix[:, counter] = vec
        word2Index[word] = counter
        index2Word[counter] = word
        counter += 1
    return embedding_matrix, word2Index, index2Word
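# A minimal usage sketch (an assumption, not from the original code): since
# the columns of `embedding_matrix` are word vectors, a word's vector is the
# column at its position in `word2Index`.
embedding_matrix, word2Index, index2Word = loadData()
vec = embedding_matrix[:, word2Index["and"]]  # (embed_dim,)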
def generate_glove_embedding(dim=300):
    if dim not in GLOVE_EMBEDDING_FILE:
        print("GloVe file of dim {} is not found.".format(dim))
        return None
    glove_file_path = os.path.join(EMBEDDING_FOLDER, GLOVE_EMBEDDING_FILE[dim])
    glove_lookup = glove2dict(glove_file_path, dim)
    vocab = ['$UNK']
    embedding = [list(np.random.uniform(low=-1.0, high=1.0, size=dim))]
    for key, value in glove_lookup.items():
        vocab.append(key)
        embedding.append(list(value))
    embedding = np.array(embedding, dtype=float)  # `np.float` is removed in recent NumPy
    print("Vocabulary size: {}, embedding shape: {}".format(len(vocab), embedding.shape))
    return [vocab, embedding]
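# A minimal usage sketch (an assumption, not from the original code): build a
# word -> row lookup over the returned vocabulary, with row 0 ($UNK) as the
# fallback for out-of-vocabulary words.
vocab, embedding = generate_glove_embedding(dim=300)
word_to_row = {w: i for i, w in enumerate(vocab)}
vec = embedding[word_to_row.get("university", word_to_row['$UNK'])]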
def build_glove_featurized_dataset(df, dim=300, np_func=np.sum):
    if dim not in GLOVE_EMBEDDING_FILE:
        print("GloVe file of dim {} is not found.".format(dim))
        return None
    glove_file_path = os.path.join(EMBEDDING_FOLDER, GLOVE_EMBEDDING_FILE[dim])
    glove_lookup = glove2dict(glove_file_path, dim)
    feature_matrix = []
    labels = []
    for index, row in df.iterrows():
        feature_matrix.append(glove_featurizer(row['tweet'], glove_lookup, np_func))
        encoded_label = 1 if row['subtask_a'] == 'OFF' else 0
        labels.append(encoded_label)
    return {'X': np.array(feature_matrix), 'y': labels}
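# `glove_featurizer` is not shown above. A minimal sketch of what such a
# featurizer might look like (an assumption, not the original implementation):
# look up each whitespace token in the GloVe dict, combine the hits with
# `np_func`, and fall back to a zero vector when nothing matches.
def glove_featurizer(text, glove_lookup, np_func=np.sum):
    dim = len(next(iter(glove_lookup.values())))
    vecs = [glove_lookup[w] for w in text.lower().split() if w in glove_lookup]
    if not vecs:
        return np.zeros(dim)
    return np_func(np.array(vecs), axis=0)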
def create_glove_embedding(vocab, glove_base_filename='glove.6B.50d.txt'):
    # Use `utils.glove2dict` to read in the GloVe file:
    ##### YOUR CODE HERE
    # (Named `glove_lookup` rather than `glove2dict` to avoid shadowing the
    # utility function's name.)
    glove_lookup = utils.glove2dict(
        os.path.join(GLOVE_HOME, glove_base_filename))

    # Use `utils.create_pretrained_embedding` to create the embedding.
    # This function will, by default, ensure that START_TOKEN,
    # END_TOKEN, and UNK_TOKEN are included in the embedding.
    ##### YOUR CODE HERE
    embedding, vocab = utils.create_pretrained_embedding(glove_lookup, vocab)

    # Be sure to return the embedding you create as well as the
    # vocabulary returned by `utils.create_pretrained_embedding`,
    # which is likely to have been modified from the input `vocab`.
    ##### YOUR CODE HERE
    return embedding, vocab
def __init__(self, opt, train):
    self.opt = opt
    self.train = train
    if self.opt.use_pretrained_embeddings:
        self.glv_dict = glove2dict(self.opt.glove_dir)
    self.preprocess_sentences()
    self.get_words()
    if self.opt.use_pretrained_embeddings:
        self.create_word_dict_glove()
        self.create_word_count_glove()
    else:
        self.create_word_dict()
        self.create_word_count()
    self.len = len(self.sentences)
    del self.sentences_all
def __init__(self, opt, train):
    self.opt = opt
    self.train = train
    self.file = open(
        self.opt.data_dir + self.opt.input_file, "r",
        encoding='utf-8', errors='ignore').read()
    if self.opt.use_pretrained_embeddings:
        self.glv_dict = glove2dict(self.opt.glove_dir)
    self.preprocess_sentences()
    self.get_words()
    if self.opt.use_pretrained_embeddings:
        self.create_word_dict_glove()
        self.create_word_count_glove()
    else:
        self.create_word_dict()
        self.create_word_count()
    self.len = len(self.sentences)
import utils
import random
import os
import tensorflow as tf
from tf_rnn_classifier import TfRNNClassifier
from collections import Counter

glove_dimensionality = 100
home = '..'
base_glove = 'glove.6B'
glove_home = os.path.join(home, base_glove)
glove_text_file = '.' + str(glove_dimensionality) + 'd.txt'
glove_lookup = utils.glove2dict(glove_home + glove_text_file)


def get_vocab(X, n_words=None):
    """Get the vocabulary for an RNN example matrix `X`,
    adding $UNK$ if it isn't already present.

    Parameters
    ----------
    X : list of lists of str
    n_words : int or None
        If this is `int > 0`, keep only the top `n_words` by frequency.

    Returns
    -------
    list of str
def fit_maxent_classifier(X, y):
    mod = LogisticRegression(
        fit_intercept=True, solver='liblinear', multi_class='auto')
    mod.fit(X, y)
    return mod


# ### GloVe inputs
#
# To illustrate this process, we'll use the general-purpose GloVe
# representations released by the GloVe team, at 300d:

# In[6]:

glove_lookup = utils.glove2dict(
    os.path.join(GLOVE_HOME, 'glove.6B.300d.txt'))


# In[7]:

def vsm_leaves_phi(tree, lookup, np_func=np.sum):
    """Represent `tree` as a combination of the vectors of its words.

    Parameters
    ----------
    tree : nltk.Tree
    lookup : dict
        From words to vectors.
    np_func : function (default: np.sum)
        A numpy matrix operation that can be applied columnwise, like
        `np.mean`, `np.sum`, or `np.prod`. The requirement is that
# Basically everything from the Jupyter notebook, but in a single runnable file.
import numpy as np
import os
import sst
import tf_trnn
import tf_lifted_trnn
import time
import utils

vsmdata_home = 'glove.6B'
glove_home = vsmdata_home

glove_lookup_100 = utils.glove2dict(
    os.path.join(glove_home, 'glove.6B.100d.txt'))
'''
glove_lookup_200 = utils.glove2dict(
    os.path.join(glove_home, 'glove.6B.200d.txt'))
glove_lookup_300 = utils.glove2dict(
    os.path.join(glove_home, 'glove.6B.300d.txt'))
'''
glove_lookup_200 = {}
glove_lookup_300 = {}


def run_experiment(eta, embed, model, phrase):
    print("===================================================")
    print("eta: %s, embed_dim: %s, model: %s, phrase-level: %s" %
          (eta, embed, model, phrase))
    print("===================================================")
    if embed == 100:
def randvec(w, n=50, lower=-1.0, upper=1.0):
    """Returns a random vector of length `n`. `w` is ignored."""
    return utils.randvec(n=n, lower=lower, upper=upper)


# In[18]:

# Any of the files in glove.6B will work here:
glove_dim = 50
glove_src = os.path.join(GLOVE_HOME, 'glove.6B.{}d.txt'.format(glove_dim))

# Creates a dict mapping strings (words) to GloVe vectors:
GLOVE = utils.glove2dict(glove_src)

def glove_vec(w):
    """Return `w`'s GloVe representation if available, else return
    a random vector."""
    return GLOVE.get(w, randvec(w, n=glove_dim))


# ### Combining words into inputs: vector_combo_func
#
# Here we decide how to combine the two word vectors into a single
# representation. In more detail, where `u` is a vector representation of the
# left word and `v` is a vector representation of the right word, we need a
# function `vector_combo_func` such that `vector_combo_func(u, v)` returns a
# new input vector `z` of dimension `m`. A simple example is concatenation:

# In[19]:
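# The original In[19] cell is not reproduced above; a minimal sketch of such a
# `vector_combo_func` (an assumption, not necessarily the original code):
# simple concatenation of the two word vectors.
def vec_concatenate(u, v):
    """Concatenate `u` and `v` into a single vector of dimension len(u) + len(v)."""
    return np.concatenate((u, v))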
def convertSentencesToIdx(
        relation2IdFileName="SemEval2010_task8_all_data/cleaned/cleaned_entity2Id.txt",
        trainFileName="SemEval2010_task8_all_data/SemEval2010_task8_training/small_TRAIN_FILE.TXT",
        gloveFileName="../../../../../../GitHub/cs224u/vsmdata/glove/glove.6B.50d.txt"):
    vocab = utils.glove2dict(gloveFileName)  # dict[word] -> numpy array(embed_dim,)
    word2Index = {}
    createWord2Index(vocab, word2Index)
    addUnk(vocab, word2Index)
    addPadding(vocab, word2Index)
    writeDictToFile(word2Index)
    relation2Id = {}
    readDictFromFile(relation2Id, relation2IdFileName)
    count = 0
    cleanedFile = open("SemEval2010_task8_all_data/cleaned/cleaned_train.txt", 'w+')
    with open(trainFileName) as f:
        lineNum = 0
        table = str.maketrans(
            {key: None for key in string.punctuation if key != "-"})
        for line in f:
            if lineNum % 4 == 2 or lineNum % 4 == 3:
                lineNum += 1
                continue
            elif lineNum % 4 == 1:
                lineNum += 1
                lineToWrite = relation2Id[line[:line.find("(")]] + lineToWrite
                cleanedFile.write(lineToWrite)
                continue
            lineCount = getLineCount(line)
            # Remove all the words in the middle of the brackets
            firstStartIdx = line.find('<e1>') + len("<e1>")
            firstEndIdx = line.find('</e1>')
            firstEntity = line[firstStartIdx:firstEndIdx + 5]
            line = line.replace(firstEntity, "")
            secondStartIdx = line.find('<e2>') + len("<e2>")
            secondEndIdx = line.find('</e2>')
            secondEntity = line[secondStartIdx:secondEndIdx + 5]
            line = line.replace(secondEntity, "")
            line = line.translate(table)
            line = line.lower()
            tokens = line.split()
            lineToWrite = ""
            # Write sentence id
            lineToWrite += " " + str(tokens[0])
            for token in tokens[1:]:
                lineToWrite += " "
                if "e1" in token:
                    lineToWrite += "<e1>"
                elif "e2" in token:
                    lineToWrite += "<e2>"
                elif token in word2Index:
                    lineToWrite += str(word2Index[token])
                else:
                    lineToWrite += str(word2Index["<UNK>"])
            lineToWrite += ("\n")
            lineNum += 1
            '''
            # Sanity check:
            if "<e1>" not in lineToWrite or "<e2>" not in lineToWrite:
                print("ERROR")
                print(lineToWrite)
            if lineCount != len(lineToWrite.split()):
                count += 1
                print(lineCount)
                print(len(lineToWrite.split()))
                print("ERROR")
                print(lineToWrite)
            '''
    cleanedFile.close()
    return vocab, word2Index, relation2Id
    processed = [process(piece) for piece in pieces]
    return torch.stack(processed, dim=2)  # (batch_size, output_dim, num_pieces)


if __name__ == "__main__":
    # Test PiecewiseCNN
    c1 = torch.rand(3, 4, 7, requires_grad=True)  # (batch_size, embed_size, sequence_len)
    c2 = torch.rand(3, 4, 2, requires_grad=True)
    pcnn = PiecewiseCNN(4, output_dim=10)
    out = pcnn(c1, c2)  # (batch_size, output_dim, num_pieces)
    print("PiecewiseCNN test:")
    print("--- out.size() == (3, 10, 2):", out.size() == (3, 10, 2))
    print()

    # Test RelationClassifier
    vocab = utils.glove2dict(
        "data/glove.6B.50d.txt")  # dict[word] -> numpy array(embed_dim,)
    rc = RelationClassifier(vocab, 50)
    X = [(["first", "piece"], ["second", "piece"], ["third", "piece"])]
    y = [0]
    print("RelationClassifier test:")
    print("--- _assemble_vec_seq:\n",
          rc._assemble_vec_seq(['apple', 'banana', 'coconut', 'durian', 'apple'])[:, :3])
    out = rc(X, y)
    print("--- forward:\n", out.size())
def load_glove50():
    glove_src = os.path.join(GLOVE_HOME, 'glove.6B.100d.txt')
    # Creates a dict mapping strings (words) to GloVe vectors:
    GLOVE = utils.glove2dict(glove_src)
    print("The number of items in glove is %d" % len(GLOVE))
    return GLOVE
train_filename = sys.argv[1]
f_train = open(train_filename, 'r')
cnt = 0
X = np.zeros(shape=(total_words, total_songs))
for line in f_train:
    args = line.split(',')
    row = np.array(args[1:])
    X[:, cnt] = row
    cnt += 1
    if cnt % 1000 == 0:
        print(cnt)

X = tfidf(X)

glove_home = 'glove.6B'
GLOVE = utils.glove2dict(os.path.join(glove_home, 'glove.6B.100d.txt'))

Y = np.zeros(shape=(total_songs, 100))
word_list = extract_top_words()
for i in range(total_songs):
    order = np.argsort(X[:, i])
    words = []
    for j in range(top_words):
        words.append(word_list[order[j]])
    allvecs = np.array([GLOVE[w] for w in words if w in GLOVE])
    feature = np.sum(allvecs, axis=0)
    Y[i, :] = feature

k_means = cluster.KMeans(n_clusters=10, n_init=4)
def test_glove2dict():
    src_filename = os.path.join("data", "glove.6B", "glove.6B.50d.txt")
    data = utils.glove2dict(src_filename)
    assert len(data) == 400000
def test_glove2dict():
    src_filename = os.path.join("vsmdata", "glove.6B", "glove.6B.50d.txt")
    utils.glove2dict(src_filename)
# In[11]:

wn_edges = get_wordnet_edges()


# ### Reproducing the WordNet synonym graph experiment

# For our VSM, let's use the 300d file included in this distribution from the
# GloVe team, as it is close to or identical to the one used in the paper:
#
# http://nlp.stanford.edu/data/glove.6B.zip
#
# If you download this archive, place it in `vsmdata`, and unpack it, then the
# following will load the file into a dictionary for you:

# In[12]:

glove_dict = utils.glove2dict(
    os.path.join(data_home, 'glove.6B', 'glove.6B.300d.txt'))


# This is the initial embedding space $\widehat{Q}$:

# In[13]:

X_glove = pd.DataFrame(glove_dict).T


# In[14]:

X_glove.T.shape


# Now we just need to replace all of the strings in `edges` with indices into `X_glove`:

# In[15]:
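# The In[15] cell is not reproduced above. A minimal sketch of one way to do
# the conversion (an assumption: `wn_edges` maps each word to a set of
# neighbor words): build a word -> row-index lookup from `X_glove` and keep
# only edges whose endpoints are both in the GloVe vocabulary.
word_to_row = {word: i for i, word in enumerate(X_glove.index)}

index_edges = {}
for word, neighbors in wn_edges.items():
    if word not in word_to_row:
        continue
    index_edges[word_to_row[word]] = {
        word_to_row[n] for n in neighbors if n in word_to_row}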
def __init__(self, dropout_prob=0.7, **kwargs):
    self.dropout_prob = dropout_prob
    self.glove_dim = 100
    glove_src = os.path.join(
        GLOVE_HOME, 'glove.6B.{}d.txt'.format(self.glove_dim))
    self.glove_dict = utils.glove2dict(glove_src)
    super().__init__(**kwargs)
def run():
    # train_dataset = NYT10Dataset('data/small_train.txt', 'data/relation2id.txt')
    # val_dataset = NYT10Dataset('data/small_val.txt', 'data/relation2id.txt')
    train_dataset = NYT10Dataset('data/train.txt', 'data/relation2id.txt')
    val_dataset = NYT10Dataset('data/val.txt', 'data/relation2id.txt')
    # test_dataset = NYT10Dataset('data/test.txt', 'data/relation2id.txt')

    # Use GPU if available, otherwise stick with cpu
    use_cuda = torch.cuda.is_available()
    torch.manual_seed(123)
    device = torch.device('cuda' if use_cuda else 'cpu')
    print(device)
    # if use_parallel:
    #     print("[Using all the available GPUs]")
    #     inception = nn.DataParallel(inception, device_ids=[0])

    vocab = utils.glove2dict("data/glove.6B.50d.txt")  # dict[word] -> numpy array(embed_dim,)
    rc_model = RelationClassifier(
        vocab, 50, train_dataset.num_relations(), device=device).to(device)
    rc_model.apply(init_weights)

    def collate_fn(batch):
        X, y = zip(*batch)
        return X, torch.LongTensor(y)

    trainset_loader = DataLoader(
        train_dataset, batch_size=50, shuffle=True, num_workers=20,
        collate_fn=collate_fn)
    valset_loader = DataLoader(
        val_dataset, batch_size=50, shuffle=False, num_workers=10,
        collate_fn=collate_fn)

    best_model_filepath = 'models/model_best.weighted.1e-1.pth.tar'
    stats_filepath = 'train_log.txt'

    dataloaders = {'train': trainset_loader, 'val': valset_loader}
    dataset_sizes = {'train': len(train_dataset), 'val': len(val_dataset)}

    loss_weights = torch.ones(train_dataset.num_relations(), device=device)
    loss_weights[0] = 1e-1
    # loss_weights[48] = 1e-2
    criterion = nn.CrossEntropyLoss(weight=loss_weights)

    optimizable_params = [
        param for param in rc_model.parameters() if param.requires_grad]
    optimizer = torch.optim.Adam(optimizable_params, lr=0.01)
    exp_lr_scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=7, gamma=0.1)
    num_epochs = 20

    # load_saved_model(best_model_filepath, rc_model, optimizer)
    # best_model = rc_model
    best_model = train_model(
        rc_model, dataloaders, dataset_sizes, criterion, optimizer,
        exp_lr_scheduler, use_cuda, best_model_filepath, num_epochs)

    predictions = evaluate_model(
        best_model, valset_loader, len(val_dataset), use_cuda)
    true_y = [y for _, y in val_dataset]
    report = classification_report(true_y, predictions)
    with open(stats_filepath, 'a') as f:
        f.write(report)
    print(report)