Example #1
 def __init__(self, inputfile, pre_trained_vocab_reverse={}, pre_trained_vocab={}, vocabulary_size=300000, embedding_dim=200, epoch_num=5, batch_size=16, windows_size=5, neg_sample_num=10):
   self.op = Options(inputfile, pre_trained_vocab_reverse, pre_trained_vocab, vocabulary_size)
   self.embedding_dim = embedding_dim
   self.windows_size = windows_size
   self.vocabulary_size = len(self.op.vocab_words)
   self.batch_size = batch_size
   self.epoch_num = epoch_num
   self.neg_sample_num = neg_sample_num
Example #2
 def __init__(self, inputfile, vocabulary_size=100000, embedding_dim=200, epoch_num=10, batch_size=16, windows_size=5, neg_sample_num=10):
   self.op = Options(inputfile, vocabulary_size)
   self.embedding_dim = embedding_dim
   self.windows_size = windows_size
   self.vocabulary_size = vocabulary_size
   self.batch_size = batch_size
   self.epoch_num = epoch_num
   self.neg_sample_num = neg_sample_num
Example #3
 def __init__(self, inputfile, embeddingsfile, reg_lambda=0.000001, embedding_dim=300, epoch_num=5, batch_size=16, windows_size=5, neg_sample_num=10):
   self.op = Options(inputfile, embeddingsfile, embedding_dim)
   self.windows_size = windows_size
   self.vocabulary_size = self.op.vocabulary_size
   self.batch_size = batch_size
   self.epoch_num = epoch_num
   self.neg_sample_num = neg_sample_num
   self.reg_lambda = reg_lambda
Example #4
 def __init__(self,
              embedsfile,
              netfile,
              netlist,
              rellist,
              n_negative,
              testmodel,
              lamtas=[1, 1, 8, 8]):
     self.op = Options(embedsfile, netfile, netlist, rellist, lamtas,
                       testmodel)
     self.neg_generator = UniformNegativeGenerator(self.op.vocab_size,
                                                   self.op.sample_table,
                                                   n_negative=n_negative)
     self.n_negative = n_negative
      print('Initialization finished')
Example #5
class word2vec:
  def __init__(self, inputfile, embeddingsfile, reg_lambda=0.000001, embedding_dim=300, epoch_num=5, batch_size=16, windows_size=5, neg_sample_num=10):
    self.op = Options(inputfile, embeddingsfile, embedding_dim)
    self.windows_size = windows_size
    self.vocabulary_size = self.op.vocabulary_size
    self.batch_size = batch_size
    self.epoch_num = epoch_num
    self.neg_sample_num = neg_sample_num
    self.reg_lambda = reg_lambda

  def train(self):
    model = skipgram(self.vocabulary_size, self.op.embeddings.shape[1], reg=self.reg_lambda)
    model.init_emd(self.op.embeddings)
 
    if torch.cuda.is_available():
      model.cuda()
    optimizer = optim.SGD(model.parameters(), lr=0.2)
    for epoch in range(self.epoch_num):
      start = time.time()     
      self.op.process = True
      batch_num = 0
      batch_new = 0

      while self.op.process:
        pos_u, pos_v, neg_v = self.op.generate_batch(self.windows_size, self.batch_size, self.neg_sample_num)

        pos_u = Variable(torch.LongTensor(pos_u))
        pos_v = Variable(torch.LongTensor(pos_v))
        neg_v = Variable(torch.LongTensor(neg_v))


        if torch.cuda.is_available():
          pos_u = pos_u.cuda()
          pos_v = pos_v.cuda()
          neg_v = neg_v.cuda()

        optimizer.zero_grad()
        loss = model(pos_u, pos_v, neg_v, self.batch_size)

        loss.backward()
   
        optimizer.step()

        if batch_num%30000 == 0:
          torch.save(model.state_dict(), './tmp/skipgram.epoch{}.batch{}'.format(epoch,batch_num))

        if batch_num%2000 == 0:
          end = time.time()
          word_embeddings = model.input_embeddings()
          sp1, sp2 = scorefunction(word_embeddings)     
          print('epoch,batch=%2d %5d: sp=%1.3f %1.3f  pair/sec = %4.2f loss=%4.3f'
                % (epoch, batch_num, sp1, sp2, (batch_num - batch_new) * self.batch_size / (end - start), loss.data[0]))
          batch_new = batch_num
          start = time.time()
        batch_num = batch_num + 1 
      print()
    print("Optimization Finished!")
    model.save_embedding("embeddings.txt", lambda x: self.op.vocab_words[x])
Example #6
 def __init__(self,
              inputfile,
              val_fn,
              vocabulary_size=4000,
              embedding_dim=100,
              epoch_num=2,
              batch_size=16,
              windows_size=5,
              neg_sample_num=10):
     logger = logging.getLogger()
     logger.info("Load train data")
     self.op = Options(inputfile, vocabulary_size)
     logger.info("Load test data")
     self.val_op = Options(val_fn,
                           vocabulary_size,
                           dictionary=self.op.dictionary)
     self.embedding_dim = embedding_dim
     self.windows_size = windows_size
     self.vocabulary_size = vocabulary_size
     self.batch_size = batch_size
     self.epoch_num = epoch_num
     self.neg_sample_num = neg_sample_num
Example #7
class ComEmb(object):
    def __init__(self,
                 embedsfile,
                 netfile,
                 netlist,
                 rellist,
                 n_negative,
                 testmodel,
                 lamtas=[1, 1, 8, 8]):
        self.op = Options(embedsfile, netfile, netlist, rellist, lamtas,
                          testmodel)
        self.neg_generator = UniformNegativeGenerator(self.op.vocab_size,
                                                      self.op.sample_table,
                                                      n_negative=n_negative)
        self.n_negative = n_negative
        print('Initialization finished')

    def train(self,
              gamma=1,
              l2=1e-3,
              epoch_num=400,
              batch_size=32,
              embedding_dim=300,
              lr=0.01,
              emnames='w2v',
              sname='geo'):
        embeds = self.op.embeds
        dims = self.op.dims
        vocab_size = self.op.vocab_size
        rel_size = self.op.rel_size
        id2word = self.op.id2word
        id2rel = self.op.id2rel
        wordindex = self.op.wordindex
        triples_id = self.op.subsampled_data
        oovs = self.op.oovs
        mean_score = self.op.meanscore
        batch_num = math.ceil(len(triples_id) / batch_size)

        print('dims: ' + str(dims))
        print('learning rate:' + str(lr))
        print('gamma: ' + str(gamma))
        print('batch_num:' + str(batch_num))
        print('mean score:' + str(mean_score))

        model = ensemble(vocab_size, rel_size, embedding_dim, embeds, gamma,
                         l2, dims)
        optimizer = optim.SGD(filter(lambda p: p.requires_grad,
                                     model.parameters()),
                              lr=lr)

        if torch.cuda.is_available():
            model = model.cuda()

        # note = open('note.txt', 'w')
        # note.write('gamma= {}\n'.format(gamma))
        # scheduler = MultiStepLR(optimizer, milestones=[50,90,120], gamma=0.5)

        for t in range(epoch_num):
            #scheduler.step()
            batch_num = 0
            for pos_triplets in self.op.batch_iter(triples_id, batch_size):
                neg_triplets, neg_ents = self.neg_generator.generate(
                    pos_triplets)
                inp = np.concatenate(
                    (pos_triplets[:, 0], pos_triplets[:, 2], neg_ents))

                pos_triplets = np.tile(pos_triplets, (self.n_negative, 1))
                weight = pos_triplets[:, 3]

                weight = Variable(torch.FloatTensor(weight))
                inp = Variable(torch.LongTensor(inp))
                pos_triplets = Variable(torch.LongTensor(pos_triplets))
                neg_triplets = Variable(torch.LongTensor(neg_triplets))

                if torch.cuda.is_available():
                    weight = weight.cuda()
                    inp = inp.cuda()
                    pos_triplets = pos_triplets.cuda()
                    neg_triplets = neg_triplets.cuda()

                optimizer.zero_grad()
                loss, sEmb, sGraph = model(inp, pos_triplets, neg_triplets,
                                           weight, mean_score)

                loss.backward()
                optimizer.step()
                # if batch_num==1000:
                # 	note.write('epoch %2d batch %2d sp=%1.3f %1.3f %1.3f %1.3f %1.3f %1.3f %1.3f loss=%2.5f sEmb=%2.5f sGraph=%2.5f \n'%(t,batch_num,sp1, sp2, sp3, sp4,sp5, sp6,sp7, loss.data[0], sEmb.data[0], sGraph.data[0]))

                if batch_num % 100 == 0:
                    word_embeddings = model.metaemb.weight.data.cpu().numpy()
                    sp1, sp2, sp3, sp4, sp5, sp6, sp7 = scorefunction(
                        wordindex, word_embeddings)
                    print(
                        'epoch %2d batch %2d sp=%1.3f %1.3f %1.3f %1.3f %1.3f %1.3f %1.3f loss=%2.5f sEmb=%2.5f sGraph=%2.5f \r'
                        % (t, batch_num, sp1, sp2, sp3, sp4, sp5, sp6, sp7,
                           loss.data[0], sEmb.data[0], sGraph.data[0]),
                        end="")
                batch_num = batch_num + 1

            word_embeddings = model.metaemb.weight.data.cpu().numpy()
            sp1, sp2, sp3, sp4, sp5, sp6, sp7 = scorefunction(
                wordindex, word_embeddings)
            print(
                'epoch=%2d sp=%1.3f %1.3f %1.3f %1.3f %1.3f %1.3f %1.3f loss=%2.5f sEmb=%2.5f sGraph=%2.5f \r'
                % (t, sp1, sp2, sp3, sp4, sp5, sp6, sp7, loss.data[0],
                   sEmb.data[0], sGraph.data[0]),
                end="")

        print(
            't=%2d  sp=%1.3f %1.3f %1.3f %1.3f %1.3f %1.3f %1.3f loss=%7.2f' %
            (t, sp1, sp2, sp3, sp4, sp5, sp6, sp7, loss.data[0]))

        fo = open('Trans_multi_loss1_gamma{}'.format(gamma), 'w')
        for k in range(len(word_embeddings[:-1])):
            emb = word_embeddings[k]
            emb = [str(i) for i in emb]
            fo.write(id2word[k] + ' ' + ' '.join(emb) + '\n')
        fo.close()
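
A usage sketch for ComEmb; every argument here is a hypothetical placeholder, since the source does not show how the class is instantiated:

# Usage sketch -- module name, file paths, and list contents are placeholders.
from com_emb import ComEmb   # assumed module containing the class above

ce = ComEmb('embeds.txt', 'net.txt', ['net_a.txt', 'net_b.txt'], ['rel_a', 'rel_b'],
            n_negative=5, testmodel=None)
ce.train(gamma=1, lr=0.01, epoch_num=400, batch_size=32)
# with gamma=1, the learned meta-embeddings are written to Trans_multi_loss1_gamma1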
Example #8
class word2vec:
  def __init__(self, inputfile, pre_trained_vocab_reverse={}, pre_trained_vocab={}, vocabulary_size=300000, embedding_dim=200, epoch_num=5, batch_size=16, windows_size=5, neg_sample_num=10):
    self.op = Options(inputfile, pre_trained_vocab_reverse, pre_trained_vocab, vocabulary_size)
    self.embedding_dim = embedding_dim
    self.windows_size = windows_size
    self.vocabulary_size = len(self.op.vocab_words)
    self.batch_size = batch_size
    self.epoch_num = epoch_num
    self.neg_sample_num = neg_sample_num


  def train(self, pre_trained_model):

    model = skipgram(self.vocabulary_size, self.embedding_dim, pre_trained_model)
    if torch.cuda.is_available():
      model.cuda()
    optimizer = optim.SGD(model.parameters(), lr=0.2)
    loss_history = list()
    for epoch in range(self.epoch_num):
      start = time.time()     
      self.op.process = True
      batch_num = 0
      batch_new = 0

      while self.op.process:
        pos_u, pos_v, neg_v = self.op.generate_batch(self.windows_size, self.batch_size, self.neg_sample_num)

        pos_u = Variable(torch.LongTensor(pos_u))
        pos_v = Variable(torch.LongTensor(pos_v))
        neg_v = Variable(torch.LongTensor(neg_v))


        if torch.cuda.is_available():
          pos_u = pos_u.cuda()
          pos_v = pos_v.cuda()
          neg_v = neg_v.cuda()

        optimizer.zero_grad()
        loss = model(pos_u, pos_v, neg_v, self.batch_size)

        loss.backward()
   
        optimizer.step()

        if batch_num % 10 == 0:
          loss_history.append(loss.data[0])

        if batch_num % 2000 == 0:
          end = time.time()
          word_embeddings = model.input_embeddings()
          # sp1, sp2 = scorefunction(word_embeddings)
          print('epoch,batch=%2d %5d:  pair/sec = %4.2f loss=%4.3f\r'
                % (epoch, batch_num, (batch_num - batch_new) * self.batch_size / (end - start), loss.data[0]),
                end="")
          batch_new = batch_num
          start = time.time()
        batch_num = batch_num + 1
      print()
      torch.save(model.state_dict(), __location__ + '/skipgram.epoch{}.batch{}'.format(epoch, batch_num))

    plt.plot(loss_history[::100])
    plt.ylabel('loss (stat.ML)')
    plt.show()
    print("Optimization Finished!")
Example #9
class word2vec:
    def __init__(self, inputfile, vocabulary_size=100000, embedding_dim=100, epoch_num=10,
                 batch_size=32, windows_size=4, neg_sample_num=7):
        self.op = Options(inputfile, vocabulary_size)
        self.embedding_dim = embedding_dim
        self.windows_size = windows_size
        self.vocabulary_size = vocabulary_size
        self.batch_size = batch_size
        self.epoch_num = epoch_num
        self.neg_sample_num = neg_sample_num

    def train(self):
        model = skipgram(self.vocabulary_size, self.embedding_dim)
        if torch.cuda.is_available():
            model.cuda()
        optimizer = optim.SGD(model.parameters(), lr=0.2)
        for epoch in range(self.epoch_num):
            start = time.time()
            self.op.process = True
            batch_num = 0
            batch_new = 0

            while self.op.process:
                pos_u, pos_v, neg_v = self.op.generate_batch(self.windows_size, self.batch_size, self.neg_sample_num)

                pos_u = Variable(torch.LongTensor(pos_u))
                pos_v = Variable(torch.LongTensor(pos_v))
                neg_v = Variable(torch.LongTensor(neg_v))

                if torch.cuda.is_available():
                    pos_u = pos_u.cuda()
                    pos_v = pos_v.cuda()
                    neg_v = neg_v.cuda()

                optimizer.zero_grad()
                loss = model(pos_u, pos_v, neg_v, self.batch_size)

                loss.backward()

                optimizer.step()

                if batch_num % 30000 == 0:
                    torch.save(model.state_dict(), './tmp/skipgram.epoch{}.batch{}'.format(epoch, batch_num))

                if batch_num % 1000 == 0:
                    end = time.time()
                    # word_embeddings = model.input_embeddings()
                    ## sp1 and sp2 based on distinct words
                    # sp1, sp2 = scorefunction(word_embeddings)
                    ## loss.data[0] changed to loss.data
                    # print('epoch,batch=%2d %5d: sp=%1.3f %1.3f  pair/sec = %4.2f loss=%4.3f\r' \
                    #       % (epoch, batch_num, sp1, sp2, (batch_num - batch_new) * self.batch_size / (end - start),
                    #          loss.data), end="")
                    print('epoch,batch=%2d %5d: pair/sec = %4.2f loss=%4.3f\r'
                          % (epoch, batch_num,
                             (batch_num - batch_new) * self.batch_size / (end - start),
                             loss.data), end="")
                    batch_new = batch_num
                    start = time.time()
                    print()
                batch_num = batch_num + 1
            # saving each epoch
            # bell
            print('\a')
            model.save_embedding(os.path.join("data",
                                              "embed_epoch_" + str(epoch) + ".vec"),
                                 self.op.dic_idx2word)
            print()
        print("Optimization Finished!")
Example #10
class word2vec:
    def __init__(self,
                 inputfile,
                 vocabulary_size=100000,
                 embedding_dim=200,
                 epoch_num=64,
                 batch_size=256,
                 windows_size=5,
                 neg_sample_num=10):
        self.op = Options(inputfile, vocabulary_size)
        self.embedding_dim = embedding_dim
        self.windows_size = windows_size
        self.vocabulary_size = vocabulary_size
        self.batch_size = batch_size
        self.epoch_num = epoch_num
        self.neg_sample_num = neg_sample_num

    # pylint: disable=missing-docstring
    # Function to draw visualization of distance between embeddings.

    def plot_with_labels(self, low_dim_embs, labels, filename):
        assert low_dim_embs.shape[0] >= len(
            labels), 'More labels than embeddings'
        plt.figure(figsize=(18, 18))  # in inches
        for i, label in enumerate(labels):
            x, y = low_dim_embs[i, :]
            plt.scatter(x, y)
            plt.annotate(label,
                         xy=(x, y),
                         xytext=(5, 2),
                         textcoords='offset points',
                         ha='right',
                         va='bottom')

        plt.savefig(filename)
        plt.show()

    def train(self):
        # cudnn.benchmark = True
        model = skipgram(self.vocabulary_size, self.embedding_dim)
        if torch.cuda.is_available():
            print("using cuda")
            model.cuda()
        else:
            print("not using cuda")

        optimizer = optim.SGD(model.parameters(), lr=0.2)
        for epoch in range(self.epoch_num):
            start = time.time()
            self.op.process = True
            batch_num = 0
            batch_new = 0
            while self.op.process:
                pos_u, pos_v, neg_v = self.op.generate_batch(
                    self.windows_size, self.batch_size, self.neg_sample_num)

                pos_u = Variable(torch.LongTensor(pos_u))
                pos_v = Variable(torch.LongTensor(pos_v))
                neg_v = Variable(torch.LongTensor(neg_v))

                if torch.cuda.is_available():
                    pos_u = pos_u.cuda()
                    pos_v = pos_v.cuda()
                    neg_v = neg_v.cuda()

                optimizer.zero_grad()
                loss = model(pos_u, pos_v, neg_v, self.batch_size)
                loss.backward()

                optimizer.step()

                if batch_num % 30000 == 0:
                    torch.save(
                        model.state_dict(),
                        './tmp/skipgram.epoch{}.batch{}'.format(
                            epoch, batch_num))

                if batch_num % 2000 == 0:
                    end = time.time()
                    word_embeddings = model.input_embeddings()
                    # sp1, sp2 = scorefunction(word_embeddings)
                    print('epoch,batch=%2d %5d:  pair/sec = %4.2f loss=%4.3f\r'
                          % (epoch, batch_num, (batch_num - batch_new) * self.batch_size / (end - start), loss.data),
                          end="")
                    batch_new = batch_num
                    start = time.time()
                batch_num = batch_num + 1
            print()

        tsne = TSNE(perplexity=30, n_components=2, init='random', n_iter=5000)
        embeds = model.u_embeddings.weight.data
        labels = []
        tokens = []
        max_size = 1000
        for idx in range(min(len(embeds), len(self.op.vocab_words), max_size)):
            tokens.append(embeds[idx].cpu().numpy())
            labels.append(self.op.vocab_words[idx])
        pca = PCA(n_components=50)
        pca_result = pca.fit_transform(tokens)
        low_dim_embs = tsne.fit_transform(pca_result)
        self.plot_with_labels(low_dim_embs, labels, 'tsne.png')

        print("Optimization finished!")
Example #11
class word2vec:
    def __init__(self,
                 inputfile,
                 vocabulary_size=1200000,
                 embedding_dim=300,
                 epoch_num=20,
                 batch_size=128,
                 windows_size=5,
                 neg_sample_num=5):
        self.op = Options(inputfile, vocabulary_size)
        self.embedding_dim = embedding_dim
        self.windows_size = windows_size
        self.vocabulary_size = vocabulary_size
        self.batch_size = batch_size
        self.epoch_num = epoch_num
        self.neg_sample_num = neg_sample_num

    def train(self):
        model = skipgram(self.vocabulary_size, self.embedding_dim)
        if torch.cuda.is_available():
            print("CUDA available")
            model.cuda()
        else:
            print("CUDA NOT available")
        optimizer = optim.SGD(model.parameters(), lr=0.2)
        for epoch in range(self.epoch_num):
            start = time.time()
            self.op.process = True
            batch_num = 0
            batch_new = 0

            while self.op.process:
                pos_u, pos_v, neg_v = self.op.generate_batch(
                    self.windows_size, self.batch_size, self.neg_sample_num)

                pos_u = Variable(torch.LongTensor(pos_u))
                pos_v = Variable(torch.LongTensor(pos_v))
                neg_v = Variable(torch.LongTensor(neg_v))

                if torch.cuda.is_available():
                    pos_u = pos_u.cuda()
                    pos_v = pos_v.cuda()
                    neg_v = neg_v.cuda()

                optimizer.zero_grad()
                loss = model(pos_u, pos_v, neg_v, self.batch_size)

                loss.backward()

                optimizer.step()

                #if batch_num%30000 == 0:
                #torch.save(model.state_dict(), './tmp/skipgram.epoch{}.batch{}'.format(epoch,batch_num))

                if batch_num % 2000 == 0:
                    end = time.time()
                    word_embeddings = model.input_embeddings()
                    sp1, sp2, num = scorefunction(word_embeddings)
                    print(
                        'epoch,batch=%2d %5d: sp=%1.3f %1.3f %3d pair/sec = %4.2f loss=%4.3f\r'
                        % (epoch, batch_num, sp1, sp2, num,
                           (batch_num - batch_new) * self.batch_size /
                           (end - start), loss.data[0]),
                        end="")
                    batch_new = batch_num
                    start = time.time()
                batch_num = batch_num + 1
            print()
        print("Optimization Finished!")
        model.save_embedding('embed_en_120w_128_ch.txt', self.op.vocab_words)
Example #12
class word2vec:
    def __init__(self,
                 inputfile,
                 val_fn,
                 vocabulary_size=4000,
                 embedding_dim=100,
                 epoch_num=2,
                 batch_size=16,
                 windows_size=5,
                 neg_sample_num=10):
        logger = logging.getLogger()
        logger.info("Load train data")
        self.op = Options(inputfile, vocabulary_size)
        logger.info("Load test data")
        self.val_op = Options(val_fn,
                              vocabulary_size,
                              dictionary=self.op.dictionary)
        self.embedding_dim = embedding_dim
        self.windows_size = windows_size
        self.vocabulary_size = vocabulary_size
        self.batch_size = batch_size
        self.epoch_num = epoch_num
        self.neg_sample_num = neg_sample_num

    def train(self):
        model = skipgram(self.vocabulary_size, self.embedding_dim)
        if torch.cuda.is_available():
            model.cuda()

        #return model
        optimizer = optim.SGD(model.parameters(), lr=0.2)
        for epoch in range(self.epoch_num):
            epoch_start = time.time()
            start = time.time()
            self.op.process = True
            batch_num = 0
            batch_new = 0

            while self.op.process:
                pos_u, pos_v, neg_v = self.op.generate_batch(
                    self.windows_size,
                    self.batch_size,
                    self.neg_sample_num,
                    verbose=False)

                pos_u = Variable(torch.LongTensor(pos_u))
                pos_v = Variable(torch.LongTensor(pos_v))
                neg_v = Variable(torch.LongTensor(neg_v))

                if torch.cuda.is_available():
                    pos_u = pos_u.cuda()
                    pos_v = pos_v.cuda()
                    neg_v = neg_v.cuda()

                optimizer.zero_grad()
                loss = model(pos_u, pos_v, neg_v, self.batch_size)
                loss.backward()
                optimizer.step()

                if batch_num % 2000 == 0:
                    end = time.time()
                    with torch.no_grad():
                        total_val_loss = 0.
                        n_val_batch = 0
                        self.val_op.process = True
                        while self.val_op.process:
                            pos_u, pos_v, neg_v = self.val_op.generate_batch(
                                self.windows_size, self.batch_size,
                                self.neg_sample_num)
                            pos_u = Variable(torch.LongTensor(pos_u))
                            pos_v = Variable(torch.LongTensor(pos_v))
                            neg_v = Variable(torch.LongTensor(neg_v))
                            if torch.cuda.is_available():
                                pos_u = pos_u.cuda()
                                pos_v = pos_v.cuda()
                                neg_v = neg_v.cuda()
                            val_loss = model(pos_u, pos_v, neg_v,
                                             self.batch_size)
                            total_val_loss += val_loss.item()
                            n_val_batch += 1

                    word_embeddings = model.input_embeddings()
                    print('epoch,batch=%2d %5d:  pair/sec = %4.2f loss=%4.3f val_loss=%4.3f\r'
                          % (epoch, batch_num, (batch_num - batch_new) * self.batch_size / (end - start),
                             loss.item(), total_val_loss / n_val_batch))
                    batch_new = batch_num
                    start = time.time()
                batch_num = batch_num + 1
            print("epoch stat, time: %.2f, batch_num: %d" %
                  (time.time() - epoch_start, batch_num))
            torch.save(model.state_dict(),
                       './tmp/skipgram.epoch{}'.format(epoch))
        print("Optimization Finished!")