Code example #1
    def __init__(self, turkCount, addTurkerOneHot):
        super(AdjEmb, self).__init__()

        cwd = os.getcwd()
        path = cwd + "/data/"

        # load a subset of GloVe that contains embeddings for the adjectives we have
        self.vocab, self.vec = torchwordemb.load_glove_text(
            path + "glove_our_adj.csv")
        self.noOfTurkers = turkCount

        self.embeddings = nn.Embedding(self.vec.shape[0], self.vec.shape[1])
        self.embeddings.weight.data.copy_(self.vec)

        # don't update the embeddings (keep the GloVe weights frozen)
        self.embeddings.weight.requires_grad = False

        self.linear1 = nn.Linear(self.vec.size(1), dense1_size)  # dense1_size is defined elsewhere in the project

        # final layer: concatenate the previous layer's output with mu, sigma,
        # and (optionally) the one-hot Turker vector
        if addTurkerOneHot:
            self.fc = torch.nn.Linear(dense1_size + turkCount + 2, 1)
        else:
            # use this when there is no one-hot vector for the Turkers
            self.fc = torch.nn.Linear(dense1_size + 2, 1)
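
The snippet shows only the constructor. A minimal sketch of how a matching forward pass might wire these layers together, assuming import torch.nn.functional as F and input names (adj_index, mu_sigma, turker_onehot) that are not in the original:

    def forward(self, adj_index, mu_sigma, turker_onehot=None):
        # frozen GloVe lookup followed by the dense projection
        hidden = F.relu(self.linear1(self.embeddings(adj_index)))
        # concatenate mu/sigma (and the one-hot Turker vector, if used)
        extras = [mu_sigma] if turker_onehot is None else [mu_sigma, turker_onehot]
        return self.fc(torch.cat([hidden] + extras, dim=1))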
Code example #2
File: tests.py  Project: zmqgeek/pytorch-wordemb
    def test_glove_text(self):
        word, vec = torchwordemb.load_glove_text("resource/glove.test.txt")

        self.assertEqual(len(word), 10)

        self.assertEqual(vec.size(0), 10)
        self.assertEqual(vec.size(1), 300)
Code example #3
def vector_loader_new(text_field_words):
    path = 'word_embedding/glove.sentiment.conj.pretrained.txt'
    words_dict, vec = torchwordemb.load_glove_text(path)
    embed_size = vec.size(1)

    # look up each word; words_dict maps word -> row index into vec,
    # so fetch the embedding row rather than the bare index
    dict_cat = []
    oov_positions = []
    for pos, word in enumerate(text_field_words):
        if word in words_dict:
            dict_cat.append(vec[words_dict[word]].tolist())
        else:
            # zero placeholder for out-of-vocabulary words, replaced below
            dict_cat.append([0.0] * embed_size)
            oov_positions.append(pos)
    count_data = len(text_field_words) - len(oov_positions)

    # replace each placeholder with the column-wise mean of the in-vocabulary
    # vectors (the zero rows contribute nothing to the sums)
    mean_vec = []
    for j in range(embed_size):
        sum_col = 0.0
        for row in dict_cat:
            sum_col += row[j]
        # divide once, after the whole column has been summed
        mean_vec.append(round(sum_col / count_data, 6))

    for pos in oov_positions:
        dict_cat[pos] = mean_vec

    return dict_cat
Code example #4
File: train_LSTM.py  Project: anhtvh/luanvanThS
def read_embed(embed_path, LM):
    # dispatch on the pretrained-embedding format
    if LM == "glove":
        vocab, vec = torchwordemb.load_glove_text(embed_path)
    else:
        vocab, vec = torchwordemb.load_word2vec_text(embed_path)

    return vocab, vec
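
A hedged usage sketch of the helper above; the file path is an assumption:

vocab, vec = read_embed("glove.6B.100d.txt", "glove")
print(vec.size())  # (vocab_size, 100)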
Code example #5
File: vocab.py  Project: TheaperDeng/SI671-project
    def get_word_index(self, padding_marker='__PADDING__', unknown_marker='__UNK__'):
        _vocab, _vec = torchwordemb.load_glove_text(self.path_glove)
        # reserve indices 0 and 1 for the padding and unknown markers
        vocab = {padding_marker: 0, unknown_marker: 1}
        for tkn, indx in _vocab.items():
            vocab[tkn] = indx + 2
        # prepend an all-zero padding row and a random normal unknown row
        vec_2 = torch.zeros((2, _vec.size(1)))
        vec_2[1].normal_()
        self.vec = torch.cat((vec_2, _vec))
        self.vocab = vocab
        return self.vocab, self.vec
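
With this shift, row 0 of self.vec stays an all-zero padding vector and row 1 a randomly initialized unknown vector, while every GloVe vector moves down two rows. A hedged usage sketch (the instance name v is an assumption):

vocab, vec = v.get_word_index()
assert vocab['__PADDING__'] == 0 and vocab['__UNK__'] == 1
print(vec[vocab['the']])  # the GloVe vector for 'the', two rows below its original index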
Code example #6
def load_glove(path, input_dial, glove_dim, load_glove):
    if load_glove:
        voc, vec = torchwordemb.load_glove_text(path)
    else:
        # fall back to vocab/vectors pickled on disk
        if glove_dim == 100:
            voc = pickle.load(open('./data/voc_100', 'rb'))
            vec = pickle.load(open('./data/vec_100', 'rb'))
        elif glove_dim == 50:
            voc = pickle.load(open('./data/voc', 'rb'))
            vec = pickle.load(open('./data/vec', 'rb'))
        else:
            raise ValueError("no cached vectors for glove_dim=%d" % glove_dim)
    input_dial.glove_voc.update(voc)
    input_dial.glove_vec = vec
Code example #7
File: datasets.py  Project: coli-saar/LSTM_sentiment
    def __init__(self, path, glove_path="./glove.6B.50d.txt"):
        self.reader = filereader.FileReader(path)
        self.pattern = re.compile(r'[^ \w]+')
        self.userdict = UserDict()

        print("Reading word vectors...")
        self.vocab, self.vec = torchwordemb.load_glove_text(glove_path)

        print("Collecting user IDs ...")
        for i in range(len(self.reader)):
            line = self.reader[i]
            data = json.loads(line)
            user = data["user_id"]
            userid = self.userdict.lookup(user)

        print("Done!")
Code example #8
def main():
    with open('../data/cb-small/instances.jsonl', 'r') as f:
        data = []
        for i in range(300):
            obj = json.loads(f.readline())
            entry = {'targetParagraphs': obj['targetParagraphs']}
            data.append(entry)

    with open('word_vec_test.json', 'w') as f:
        for entry in data:
            f.write(json.dumps(entry) + '\n')
    vocab, emb = torchwordemb.load_glove_text('glove.6B.50d.txt')
    #vocab = None
    #emb = None

    with open('word_vec_test.json', 'r') as f:
        inputs = [json.loads(line) for line in f]

    get_word_ids(inputs, vocab, emb)
Code example #9
# header assumed; the original snippet starts mid-signature
def save_checkpoint(state, is_best,
                    filename='/media/storage/word_vectors/checkpoint' +
                    args.version + '.pth.tar'):
    torch.save(state, filename)
    if is_best:
        shutil.copyfile(
            filename, '/media/storage/word_vectors/model_best' + args.version +
            '.pth.tar')


torch.manual_seed(args.seed)
if args.cuda:
    torch.cuda.manual_seed(args.seed)

path = args.path_word_vectors

vocab, vec = torchwordemb.load_glove_text(path)
word_vec_dataset = torch.utils.data.TensorDataset(vec, vec)
train_loader = torch.utils.data.DataLoader(word_vec_dataset,
                                           shuffle=True,
                                           batch_size=args.batch_size,
                                           num_workers=10)
test_loader = torch.utils.data.DataLoader(word_vec_dataset,
                                          shuffle=False,
                                          batch_size=args.test_batch_size,
                                          num_workers=10)
inverse_vocab = {idx: word for word, idx in vocab.items()}


class Encoder(nn.Module):
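    # the snippet ends at the class line above; what follows is a hypothetical
    # continuation, assuming an autoencoder over the word vectors (the dataset
    # pairs each vector with itself via TensorDataset(vec, vec));
    # bottleneck_dim is an assumed name, not the project's actual code
    def __init__(self, embed_dim, bottleneck_dim):
        super(Encoder, self).__init__()
        self.fc = nn.Linear(embed_dim, bottleneck_dim)

    def forward(self, x):
        return torch.relu(self.fc(x))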
Code example #10
df = pd.read_csv(input_path, sep='\t', header=None, encoding="ISO-8859-1")
train, test = train_test_split(df,
                               test_size=0.2,
                               random_state=RANDOM_SEED,
                               shuffle=True)

#X_train = (train.to_frame().T)
X_train = train[2]
y_train = train[1]
X_test = test[2]
y_test = test[1]

## Load pretrained word vector
if args['dim'] == 50:
    print('loading 50d glove embedding')
    vocab, vec = torchwordemb.load_glove_text(
        "/diskA/animesh/glove/glove.6B.50d.txt")
elif args['dim'] == 100:
    print('loading 100d glove embedding')
    vocab, vec = torchwordemb.load_glove_text(
        "/diskA/animesh/glove/glove.6B.100d.txt")
elif args['dim'] == 200:
    print('loading 200d glove embedding')
    vocab, vec = torchwordemb.load_glove_text(
        "/diskA/animesh/glove/glove.6B.200d.txt")
elif args['dim'] == 300:
    print('loading 300d glove embedding')
    vocab, vec = torchwordemb.load_glove_text(
        "/diskA/animesh/glove/glove.6B.300d.txt")
else:
    print("Embedding dimension not available. Defaulting to 50 dimensions")
    vocab, vec = torchwordemb.load_glove_text(
        "/diskA/animesh/glove/glove.6B.50d.txt")  # path assumed from the pattern above
Code example #11
import torch
import utils
import settings
import re
import torchwordemb

from live_sentiment import text2vec

if __name__ == "__main__":
    # Load model
    model = utils.generate_model_from_settings()
    utils.load_model_params(model, settings.args.load_path)

    # Load glove
    print("Reading word vectors...")
    vocab, vec = torchwordemb.load_glove_text(
        settings.DATA_KWARGS["glove_path"])
    print("Done!")

    while True:
        text = input("What's on your mind? \n")
        features = text2vec(text, vocab, vec)
        features = utils.pack_sequence([features])
        (features, lengths) = torch.nn.utils.rnn.pad_packed_sequence(features)
        out = model(features, lengths)

        stars = float(out[0, 0, 0])
        if stars < 1.1:
            print("Watch your language, kid")
        print("Your mind has the following rating: {}".format(stars))
Code example #12
# word embeddings


# preparing ngrams of size n = 6

ngrams = [([train_data[i], train_data[i + 1], train_data[i + 2], train_data[i + 3], train_data[i + 4]], train_data[i + 5])
          for i in range(len(train_data) - 5)]

ngrams_valid = [([valid_data[i], valid_data[i + 1], valid_data[i + 2], valid_data[i + 3], valid_data[i + 4]], valid_data[i + 5])
                for i in range(len(valid_data) - 5)]


# load embeddings
try:
    vocab, vec = torchwordemb.load_glove_text("../embeddings/glove.6b/glove.6B.50d.txt")
except FileNotFoundError:
    vocab, vec = torchwordemb.load_glove_text("./embeddings/glove.6b/glove.6B.50d.txt")

# vocab of treebank
vocab_tb = data.dictionary.word2idx.keys()

# mean vec of all embeddings
mean_vec = torch.mean(vec, 0).view(1, 50)


# mean vec for digits
numvec = vec[vocab["0"], :].view(1, 50)
numvec = torch.cat((vec[vocab["1"], :].view(1, 50), numvec), 0)
numvec = torch.cat((vec[vocab["2"], :].view(1, 50), numvec), 0)
numvec = torch.cat((vec[vocab["3"], :].view(1, 50), numvec), 0)
Code example #13
def make_dataset(categories,
                 validation,
                 test,
                 cnn_style=False,
                 word2vec=False,
                 train_frac=None):
    """
    Makes a dataset out of the given categories and validation and test
    proportions.

    Args:
        categories: list of strings corresponding to files, e.g. 'dog' for
        file 'dog.npy'.

        validation: float in (0, 1), proportion of validation examples.

        test: float in (0, 1), proportion of test examples.
        
        cnn_style: if True, output shape will be [batch, channel, row, col]

        word2vec: if True, also return an [n, e] array where n is the number
        of classes and e is the embedding dimension (the vectors are loaded
        from a GloVe text file despite the parameter name).
        
        train_frac: if provided, only take this fraction of training examples.
    """

    X_train = []
    y_train = []
    X_valid = []
    y_valid = []
    X_test = []
    y_test = []
    for i, cat in enumerate(categories):
        images = load_images(cat + ".npy", cnn_style=cnn_style)

        num_test = int(test * len(images))
        num_valid = int(validation * len(images))
        num_train = len(images) - num_valid - num_test

        X_train.extend(images[:num_train])
        X_valid.extend(images[num_train:(num_train + num_valid)])
        X_test.extend(images[(num_train + num_valid):])

        y_train.extend([i] * num_train)
        y_valid.extend([i] * num_valid)
        y_test.extend([i] * num_test)

    def shuffled_arrays(X, y):
        X = np.array(X)
        y = np.array(y)
        assert len(X) == len(y)
        order = np.arange(len(X))
        np.random.shuffle(order)
        return X[order], y[order]

    X_train, y_train = shuffled_arrays(X_train, y_train)
    X_valid, y_valid = shuffled_arrays(X_valid, y_valid)
    X_test, y_test = shuffled_arrays(X_test, y_test)

    mean = X_train.mean()
    std = X_train.std()

    X_train = (X_train - mean) / std
    X_valid = (X_valid - mean) / std
    X_test = (X_test - mean) / std

    if train_frac:
        num_train = int(train_frac * len(X_train))
        X_train = X_train[:num_train]
        y_train = y_train[:num_train]

    if word2vec:
        vocab, vec = torchwordemb.load_glove_text("glove.6B.100d.txt")
        emb = np.zeros((len(categories), 100))
        for i, cat in enumerate(categories):
            pos = vocab[cat]
            emb[i] = vec[pos]
        return X_train, y_train, X_valid, y_valid, X_test, y_test, emb
    else:
        return X_train, y_train, X_valid, y_valid, X_test, y_test
Code example #14
File: datasets.py  Project: netrome/LSTM_sentiment
    def __init__(self, path, glove_path="./glove.6B.50d.txt"):
        self.reader = filereader.FileReader(path)
        self.pattern = re.compile(r'[^ \w]+')
        print("Reading word vectors...")
        self.vocab, self.vec = torchwordemb.load_glove_text(glove_path)
        print("Done!")
Code example #15
def load_glove_vecs(path):
    vocab, vec = torchwordemb.load_glove_text(path)
    return vocab, vec
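
For completeness, a minimal usage sketch of this wrapper; the file name and lookup word are assumptions:

vocab, vec = load_glove_vecs("glove.6B.50d.txt")
print(vec.size())           # (vocab_size, 50)
print(vec[vocab["apple"]])  # the embedding row for "apple"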