Example no. 1
def train(args):
    tf.set_random_seed(19)
    tf_config = tf.ConfigProto(allow_soft_placement=True)
    tf_config.gpu_options.allow_growth = True
    with tf.Session(config=tf_config) as sess:
        model = SkipGram(sess=sess, **args['dataset'], **args['model'], **args)
        model.train()
Example no. 2
def train_skipgram():
    losses = []
    loss_fn = nn.MSELoss()
    model = SkipGram(vocab_size, embed_size)
    print(model)
    print('vocab_size:', vocab_size)
    optimizer = optim.SGD(model.parameters(), lr=learning_rate)
    skipgram_train_data = create_skipgram_dataset(text)

    model.train()
    for epoch in range(n_epoch):
        total_loss = .0

        for in_w, out_w, target in skipgram_train_data:
            in_w_var = Variable(torch.LongTensor([w2i[in_w]]))
            out_w_var = Variable(torch.LongTensor([w2i[out_w]]))

            model.zero_grad()
            log_probs = model(in_w_var, out_w_var)
            loss = loss_fn(log_probs[0], Variable(torch.Tensor([target])))
            loss.backward()
            optimizer.step()
            total_loss += float(loss)
        losses.append(total_loss)
    return model, losses
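The loop above only makes sense given a SkipGram module that scores a (center word, context word) pair against a 0/1 target. A minimal sketch of such a module, assuming sigmoid-of-dot-product scoring (an illustration, not the project's actual class):

import torch
import torch.nn as nn

class SkipGram(nn.Module):
    """Sketch only: scores a (center, context) pair; assumed, not the original."""
    def __init__(self, vocab_size, embed_size):
        super().__init__()
        self.in_embed = nn.Embedding(vocab_size, embed_size)   # center-word vectors
        self.out_embed = nn.Embedding(vocab_size, embed_size)  # context-word vectors

    def forward(self, center, context):
        v = self.in_embed(center)    # [batch, embed_size]
        u = self.out_embed(context)  # [batch, embed_size]
        # probability-like score that the pair co-occurs, trained against a 0/1 target
        return torch.sigmoid((v * u).sum(dim=1))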
Example no. 3
class word2vec():
    def __init__(self, mode, vocab_dim, embed_dim, sparse):
        self.mode = mode
        if self.mode == 'cbow':
            self.model = CBOW(vocab_dim, embed_dim, sparse)
        elif self.mode == 'skip-gram':
            self.model = SkipGram(vocab_dim, embed_dim, sparse)
        
    def train(self, training_data, num_epochs=3, learning_rate=0.025):
        # Upload Model to GPU
        device = torch.device('cuda:0')
        self.model.to(device)
        
        # Set Optimizer and Linear Scheduler
        optimizer = optim.SGD(self.model.parameters(), lr=learning_rate)
        scheduler_1 = optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma = 2/3)
        scheduler_2 = optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma = 1/2)
        
        # Set Loss Function
        loss_function = nn.NLLLoss()
        
        # Train
        for epoch in range(num_epochs):
            print('Epoch {} Started...'.format(epoch+1))
            for i, (X, y) in tqdm(enumerate(training_data)):
                if X.nelement() != 0:
                    X, y = X.to(device), y.to(device)
                    optimizer.zero_grad()
                    loss = loss_function(self.model.forward(X), y)
                    loss.backward()
                    optimizer.step()
                    if i%50000 == 0:
                        print('Iteration : {}, Loss : {:.6f}'.format(i, loss.item()))
            if epoch == 0:
                scheduler_1.step()
            elif epoch == 1:
                scheduler_2.step()
Example no. 4
def main():
    train_set = PTBDataSet()
    train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True)
    device = torch.device('cuda')
    model = SkipGram(train_set.get_token_num(), embedding_dim).to(device)
    optimizer = optim.Adam(model.parameters(), lr=lr)
    for epoch in range(epochs):
        total_loss = 0
        for batch_idx, (center, context_negative, mask, label) in enumerate(train_loader):
            center, context_negative, mask, label = center.to(device), context_negative.to(device), mask.to(
                device), label.to(device)
            criteon = nn.BCEWithLogitsLoss(weight=mask.double(), reduction='none').to(device)
            # pred: [batch_size, max_len]
            pred = model(center, context_negative)
            loss = torch.sum(torch.sum(criteon(pred.double(), label.double()), dim=1) / torch.sum(mask.double(), dim=1))
            total_loss += loss.item()
            if batch_idx % 200 == 0:
                print(f'epoch {epoch+1} batch {batch_idx} loss {loss.item()/pred.shape[0]}')
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        print(f'-->epoch {epoch+1} average loss {total_loss/train_set.__len__()}')

    model.get_topk_similar_tokens('chip', train_set.index_to_token, train_set.token_to_index, device, show_top_k)
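This example expects SkipGram(center, context_negative) to return one raw logit per candidate word, i.e. a [batch_size, max_len] tensor fed to BCEWithLogitsLoss. A common way to produce those logits, sketched here under the assumption of two embedding tables and a batched matrix product (not the project's actual model):

import torch
import torch.nn as nn

class SkipGram(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super().__init__()
        self.center_embed = nn.Embedding(vocab_size, embedding_dim)
        self.context_embed = nn.Embedding(vocab_size, embedding_dim)

    def forward(self, center, contexts_negatives):
        # assumed shapes: center [batch, 1], contexts_negatives [batch, max_len]
        v = self.center_embed(center)               # [batch, 1, dim]
        u = self.context_embed(contexts_negatives)  # [batch, max_len, dim]
        # one logit per (center, candidate) pair; the mask in the loss hides padding
        return torch.bmm(v, u.permute(0, 2, 1)).squeeze(1)  # [batch, max_len]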
Example no. 5
    def train(self):
        if self.model_name == 'SkipGram':
            model = SkipGram(self.vocabulary_size, self.embedding_dim)
        elif self.model_name == 'CBOW':
            return

        if torch.cuda.is_available():
            model.cuda()

        optimizer = optim.SGD(model.parameters(), lr=0.2)

        for epoch in range(self.epoch):
            start = time.time()
            self.op.process = True
            batch_num = 0
            batch_new = 0

            while self.op.process:
                pos_u, pos_v, neg_v = self.op.generate_batch(
                    self.windows_size, self.batch_size, self.neg_sample_size)

                pos_u = Variable(torch.LongTensor(pos_u))
                pos_v = Variable(torch.LongTensor(pos_v))
                neg_v = Variable(torch.LongTensor(neg_v))

                if torch.cuda.is_available():
                    pos_u = pos_u.cuda()
                    pos_v = pos_v.cuda()
                    neg_v = neg_v.cuda()

                optimizer.zero_grad()
                loss = model(pos_u, pos_v, neg_v, self.batch_size)
                loss.backward()
                optimizer.step()

                if batch_num % 3000 == 0:
                    end = time.time()
                    print(
                        'epoch,batch = %2d %5d:   pair/sec = %4.2f  loss = %4.3f\r'
                        % (epoch, batch_num,
                           (batch_num - batch_new) * self.batch_size /
                           (end - start), loss.item()),
                        end="\n")
                    batch_new = batch_num
                    start = time.time()
                batch_num += 1

        model.save_embeddings(self.op.idx2word, 'word_embdding.txt',
                              torch.cuda.is_available())
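In this example the model returns the training loss directly: model(pos_u, pos_v, neg_v, batch_size) is expected to evaluate the negative-sampling objective. A hedged sketch of such a forward pass (the signature comes from the call above; the body is an assumption):

import torch
import torch.nn as nn
import torch.nn.functional as F

class SkipGram(nn.Module):
    def __init__(self, vocab_size, embed_dim):
        super().__init__()
        self.u_embed = nn.Embedding(vocab_size, embed_dim)  # center words
        self.v_embed = nn.Embedding(vocab_size, embed_dim)  # context / negative words

    def forward(self, pos_u, pos_v, neg_v, batch_size):
        u = self.u_embed(pos_u)    # [batch, dim]
        v = self.v_embed(pos_v)    # [batch, dim]
        n = self.v_embed(neg_v)    # [batch, k, dim] negative samples
        pos = F.logsigmoid((u * v).sum(dim=1))                             # [batch]
        neg = F.logsigmoid(-torch.bmm(n, u.unsqueeze(2))).sum(dim=(1, 2))  # [batch]
        return -(pos + neg).sum() / batch_size  # mean negative-sampling loss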
Example no. 6
def main():
    args = docopt(__doc__)

    embedding_dim = int(args['--dim'])
    max_context = int(args['--max-context'])
    neg_sample_factor = int(args['--neg-sample-factor'])

    batch_size = int(args['--batch'])
    lr = float(args['--lr'])
    epochs = int(args['--epochs'])

    np.random.seed(int(args['--seed']))
    torch.manual_seed(int(args['--seed']))
    torch.cuda.manual_seed_all(int(args['--seed']))
    device = torch.device(int(args['--device']))
    print(f"{device} will be used")
    num_workers = int(args['--num-workers'])
    fpath = args['--file']
    backup_interval = int(args['--backup-interval'])
    dname = args['--dirname']

    dset = FixedLengthContextDataset(fpath, max_context, neg_sample_factor)
    vocabulary_size = dset.num_authors

    # Symmetric vectors are used to compute cosine similarity
    if args['symmetric']:
        model = SymmetricEmbedding(vocabulary_size, embedding_dim)
    # Word2Vec Skip-gram. Unsymmetric vectors are used to compute cosine similarity
    elif args['skipgram']:
        model = SkipGram(vocabulary_size, embedding_dim)

    if dname is None:
        tmp = 'symmetric' if args['symmetric'] else 'skipgram'
        dname = get_dirname(f'embedding_{tmp}')
    else:
        os.makedirs(dname)

    if torch.cuda.is_available():
        model = model.to(device)
    loader = DataLoader(dset, batch_size, num_workers=num_workers)
    train(model, loader, dname, epochs, lr, backup_interval, device)
Example no. 7
    def train(self, report=True):
        model = SkipGram(self.vocabulary_size, self.embedding_dim)

        loss_list = list()

        if torch.cuda.is_available():
            model.cuda()

        optimizer = optim.SGD(model.parameters(), lr=0.2)

        for epoch in range(self.epoch):

            start = time.time()
            self.data.process = True
            batch_num = 0
            batch_new = 0

            for data_word, data_sentence in self.data_loader():

                optimizer.zero_grad()
                loss = model(data_word) / self.batch_size
                # loss = model(pos_u, pos_v, neg_v, self.batch_size, target, contex, labels)
                loss_list.append(loss.item())  # store a plain float so the graph is not kept alive
                loss.backward()
                optimizer.step()

                if report and batch_num % 7 == 0:  # 3000
                    end = time.time()
                    print(
                        'epoch,batch = %2d %5d:   batch_size = %5d  loss = %4.3f\r'
                        % (epoch, batch_num, self.batch_size, loss.item()),
                        end="\n")
                    batch_new = batch_num
                    start = time.time()
                batch_num += 1

        self.showPlot(loss_list, 'Losses')
        model.save_embeddings(self.data.idx2word, 'word_embdding.txt')
Example no. 8
File: run.py Project: kvthr/NLP
print("Load corpus from pickle file...")
with open('nltk_reuters_corpus.pkl', 'rb') as f:
    corpus = pickle.load(f)

print("Building model...")
vocab_size, vocab = get_count_distinct(corpus)
word2idx, idx2word = get_vocab_dicts(vocab_size, vocab)
window_size = 2

# declare a Tensorflow graph
graph = tf.Graph()
with graph.as_default() as g:
    # create an instance of SkipGram model
    model = SkipGram(vocab_size=vocab_size,
                     embedding_dim=128,
                     window_size=window_size,
                     batch_size=16,
                     graph=g)

    # configure GPU options
    sess_config = tf.ConfigProto(allow_soft_placement=True)
    sess_config.gpu_options.allow_growth = True

    # start the Tensorflow session
    with tf.Session(config=sess_config) as sess:

        # writer and saver objects
        writer = tf.summary.FileWriter("./logs")
        sess.run(tf.global_variables_initializer())
        saver = tf.train.Saver()
        # check for already present checkpoint files
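The snippet is truncated at the checkpoint check. A typical TF1 continuation of the block above (illustrative only; the directory name and logic are assumptions, not the project's code) would look up the latest checkpoint and restore it before training:

        ckpt = tf.train.latest_checkpoint("./checkpoints")  # assumed checkpoint directory
        if ckpt is not None:
            saver.restore(sess, ckpt)  # resume from the saved variables instead of starting fresh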
Example no. 9
s2i = get_pickle('assets/s2i.pkl')
i2s = get_pickle('assets/i2s.pkl')
holdings = pd.read_csv('assets/holdings.csv', index_col=6)

glove_cor_checkpoint = torch.load('assets/model/model_glove_cor.torch')
model_glove = GloVeCor(len(s2i), 300)
model_glove.load_state_dict(glove_cor_checkpoint['state_dict'])
weights = model_glove.embeddings.weight.detach()
np.savetxt('embeddings/glove_cor_tensors.tsv', weights, delimiter='\t')

glove_cov_checkpoint = torch.load('assets/model/model_glove_cov.torch')
model_glove = GloVeCov(len(s2i), 300)
model_glove.load_state_dict(glove_cov_checkpoint['state_dict'])
weights = model_glove.embeddings.weight.detach()
np.savetxt('embeddings/glove_cov_tensors.tsv', weights, delimiter='\t')

skip_checkpoint = torch.load('assets/model/model_skip.torch')
model_skip = SkipGram(len(s2i), 300)
model_skip.load_state_dict(skip_checkpoint['state_dict'])
weights = model_skip.embeddings.weight.detach()
np.savetxt('embeddings/skip_tensors.tsv', weights, delimiter='\t')

selector = [i2s[e] for e in range(len(weights))]
cols = ['Name', 'Sector', 'Industry Group', 'Country', 'Currency']
metadata = holdings.loc[selector, cols]
metadata.to_csv('assets/metadata.tsv', sep='\t')
cols = ['Name', 'Currency']
metadata = holdings.loc[selector, cols]
metadata.to_csv('embeddings/metadata.tsv', sep='\t', index=False)
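The paired *_tensors.tsv and metadata.tsv files written here match the tab-separated layout the TensorFlow Embedding Projector accepts (one embedding vector per row, plus an aligned metadata table), which is presumably how these exports are meant to be visualized.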
Example no. 10
args = parser.parse_args()


tokenize_data = loadData()
int_text, word2idx, idx2word, freq, vocab = prepareData(tokenize_data, args.min_freq)


if args.mode == "train":
	vocab_size = sum([freq[k] for k in freq])
	subsampled_words = subsampling(freq, args.sampling_threshold, vocab, vocab_size, word2idx)
	neg_sample = negativeSampling(freq)
	#print(neg_sample.shape)

	device='cpu'

	model = SkipGram(len(word2idx), args.embed_size, neg_sample).to(device)
	optimizer = optim.Adam(model.parameters(), args.lr)
	epoch = args.epochs
	steps = 0

	for i in range(epoch):

	  for input_words, target_words in loadBatches(subsampled_words, args.batch_size, args.window_size):
	    steps = steps + 1
	  
	    inputs = torch.LongTensor(input_words)
	    targets = torch.LongTensor(target_words)
	    
	    #inputs, targets = inputs.to(device), targets.to(device)
	    loss = model.forward(inputs, targets, inputs.shape[0], 2)
Example no. 11
    def create_model(self):
        print("Initialize model")
        vocab_size = len(self.word2idx)
        self.model = SkipGram(vocab_size=vocab_size, emb_dim=self.embed_dim).to(self.device)
        self.optimizer = optim.SGD(self.model.parameters(), lr=self.lr)
Example no. 12
class Word2Vec:
    def __init__(self, lang="english",
                 n_epoch=20,
                 batch_size=500,
                 embed_dim=300,
                 window_size=5,
                 neg_sample=10,
                 min_count=5,
                 lr=0.01,
                 report_every=1):

        self.lang = lang
        self.n_epoch = n_epoch
        self.batch_size = batch_size
        self.embed_dim = embed_dim
        self.window_size = window_size
        self.neg_sample = neg_sample
        self.min_count = min_count
        self.lr = lr
        self.report_every = report_every

        self.model, self.optimizer = None, None
        self.batches, self.vocab, self.word2idx, self.idx2word = [], [], [], []

        # check if GPU available
        is_cuda = torch.cuda.is_available()
        self.device = torch.device("cuda" if is_cuda else "cpu")
        # number of cpu threads for torch
        workers = multiprocessing.cpu_count()
        torch.set_num_threads(workers)

        print("Train session using {}, processor numbers: {}".format(self.device, workers))

    def handle_data(self):
        # get dataset in correct format
        print("Downloading the data")
        train_data, dev_data, test_data = get_data(self.lang)
        # process data for training
        processor = DataProcess(corpus=train_data,
                                batch_size=self.batch_size,
                                neg_sample=self.neg_sample,
                                window_size=self.window_size,
                                min_freq=self.min_count)
        print("Processing data")
        self.batches, self.vocab, self.word2idx, self.idx2word = processor.pipeline()

    def create_model(self):
        print("Initialize model")
        vocab_size = len(self.word2idx)
        self.model = SkipGram(vocab_size=vocab_size, emb_dim=self.embed_dim).to(self.device)
        self.optimizer = optim.SGD(self.model.parameters(), lr=self.lr)

    def train(self):
        print('Start training')
        # print(self.data.gen_batch()[0])
        for epoch in range(self.n_epoch):
            total_loss = 0

            for minibatch in self.batches:
                pos_u = torch.tensor(minibatch[0], dtype=torch.long).to(self.device)
                pos_v = torch.tensor(minibatch[1], dtype=torch.long).to(self.device)
                neg_v = torch.tensor(minibatch[2], dtype=torch.long).to(self.device)

                # print(len(pos_u), len(pos_v), len(neg_v))
                self.optimizer.zero_grad()
                loss = self.model.forward(pos_u, pos_v, neg_v)
                loss.backward()
                self.optimizer.step()

                total_loss += loss.item()

            if ((epoch + 1) % self.report_every) == 0:
                print('epoch: %d, loss: %.4f' % (epoch + 1, total_loss))

    def save_model(self, filepath):
        print("Saved model in {}".format(filepath))
        self.model.save(filepath, self.idx2word)
Example no. 13
def main(args):
    LongTensor = torch.cuda.LongTensor if args.gpu else torch.LongTensor
    data = get_pickle('assets/dataset.pkl')
    i2s = get_pickle('assets/i2s.pkl')
    dataset = skipDataset(data)
    model = SkipGram(len(i2s), 300)
    if args.gpu:
        model.cuda()
    optimizer = optim.Adam(model.parameters(), lr=args.lr)
    losses = []
    epoch_losses = [np.inf, np.inf, np.inf]
    total_n = len(dataset)
    tmplt = "E:{:2d} - i:{:5d}({:4.2f}%) - L:{:5.5f}"
    for epoch in range(args.epoch):
        dataloader = DataLoader(dataset,
                                batch_size=args.bs,
                                collate_fn=collate_fn,
                                shuffle=True)
        model.train()
        losses = []
        for i, batch in enumerate(dataloader):
            center, target = batch
            center = LongTensor(center)
            target = LongTensor(target)
            loss = model(center, target)
            loss.backward()
            optimizer.step()
            model.zero_grad()
            losses.append(loss.item())  # store floats so np.mean works even on GPU runs
            if i % 100 == 0:
                ml = np.mean(losses)
                t = tmplt.format(epoch, i, i * args.bs / total_n * 100, ml)
                print(t)
                losses = []
        model.eval()
        dataloader = DataLoader(dataset,
                                batch_size=args.bs,
                                collate_fn=collate_fn,
                                shuffle=True)
        losses = []
        for i, batch in enumerate(dataloader):
            center, target = batch
            center = LongTensor(center)  # use the device-aware constructor so GPU runs do not crash
            target = LongTensor(target)
            loss = model(center, target)
            losses.append(loss.item())
        epoch_losses.append(np.mean(losses))
        print('Epoch loss {}'.format(epoch_losses[-1]))
        if epoch_losses[-1] > epoch_losses[-4]:
            break
        else:
            filename = 'assets/model/model_skip.torch'
            state = dict(state_dict=model.state_dict(),
                         loss=epoch_losses,
                         args=args)
            torch.save(state, filename)
Example no. 14
    with open(f'{FILENAME}.vocab') as f:
        vocab = sorted(f.readline().split())
    VOCAB_SIZE = len(vocab)

    # String2int conversion
    words_to_idx = {i: j for j, i in enumerate(vocab)}

    # Create dataset
    train_dataset = WikiDataset(f'{FILENAME}.parsed', 1, words_to_idx)
    train_dataloader = torch.utils.data.DataLoader(train_dataset,
                                                   batch_size=BATCH_SIZE,
                                                   num_workers=NUM_WORKERS)

    # Get random sampler
    sampler = NegativeSampler(f'{FILENAME}.parsed.count', words_to_idx)

    for emb_dim in EMBEDDING_DIMS:
        model = SkipGram(VOCAB_SIZE, emb_dim)
        model.to(DEVICE)
        model.device = DEVICE

        optimizer = torch.optim.SparseAdam(model.parameters())

        train(model,
              train_dataloader,
              optimizer,
              sampler,
              VOCAB_SIZE,
              epochs=EPOCHS,
              save_path=f'{SAVE_FOLDER}{emb_dim}_')
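A note on the optimizer choice above: torch.optim.SparseAdam only supports parameters that receive sparse gradients, so the SkipGram used here presumably builds its embedding tables with sparse gradients enabled, along the lines of:

    nn.Embedding(VOCAB_SIZE, emb_dim, sparse=True)  # sparse gradients, as SparseAdam requires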
Example no. 15
    def __init__(self, mode, vocab_dim, embed_dim, sparse):
        self.mode = mode
        if self.mode == 'cbow':
            self.model = CBOW(vocab_dim, embed_dim, sparse)
        elif self.mode == 'skip-gram':
            self.model = SkipGram(vocab_dim, embed_dim, sparse)